================= House Price Prediction - Regression ==================

Import Dataset

library(plyr) #must import plyr first than dplyr - confliction issue
library(dplyr) #dplyr pipelines
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret) #cv / model train / BoxCoxtrans
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(ggplot2) #ggplot - visualization
library(gridExtra) #grid.arrange
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(tidyr)
library(purrr) #keep
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
## 
##     lift
## The following object is masked from 'package:plyr':
## 
##     compact
library(randomForest) #random forest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(corrplot) #corrplot
## corrplot 0.84 loaded
library(e1071) #skewness / kurtosis
library(car) #vif
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(glmnet) #ridge/lasso/elastic net
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded glmnet 2.0-18
library(xgboost) #xgboost
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
#setting address
#setwd("~/Desktop/Practice/Kaggle/Housing/HousePrices/Datasets")

#read the train and test files, keeping strings as plain characters
train <- read.csv("Datasets/train.csv", stringsAsFactors = FALSE)
test <- read.csv("Datasets/test.csv", stringsAsFactors = FALSE)

#add an empty response column to the test set so both sets share the same columns
test$SalePrice <- NA

#stack train on top of test, drop the Id column, and
#treat MSSubClass (type of dwelling) as a category rather than a number
dat <- rbind(train, test)
dat$Id <- NULL
dat$MSSubClass <- as.factor(dat$MSSubClass)

#summary
dat %>% mutate_if(is.character, as.factor) %>% summary
##    MSSubClass      MSZoning     LotFrontage        LotArea      
##  20     :1079   C (all):  25   Min.   : 21.00   Min.   :  1300  
##  60     : 575   FV     : 139   1st Qu.: 59.00   1st Qu.:  7478  
##  50     : 287   RH     :  26   Median : 68.00   Median :  9453  
##  120    : 182   RL     :2265   Mean   : 69.31   Mean   : 10168  
##  30     : 139   RM     : 460   3rd Qu.: 80.00   3rd Qu.: 11570  
##  70     : 128   NA's   :   4   Max.   :313.00   Max.   :215245  
##  (Other): 529                  NA's   :486                      
##   Street      Alley      LotShape   LandContour  Utilities   
##  Grvl:  12   Grvl: 120   IR1: 968   Bnk: 117    AllPub:2916  
##  Pave:2907   Pave:  78   IR2:  76   HLS: 120    NoSeWa:   1  
##              NA's:2721   IR3:  16   Low:  60    NA's  :   2  
##                          Reg:1859   Lvl:2622                 
##                                                              
##                                                              
##                                                              
##    LotConfig    LandSlope   Neighborhood    Condition1     Condition2  
##  Corner : 511   Gtl:2778   NAmes  : 443   Norm   :2511   Norm   :2889  
##  CulDSac: 176   Mod: 125   CollgCr: 267   Feedr  : 164   Feedr  :  13  
##  FR2    :  85   Sev:  16   OldTown: 239   Artery :  92   Artery :   5  
##  FR3    :  14              Edwards: 194   RRAn   :  50   PosA   :   4  
##  Inside :2133              Somerst: 182   PosN   :  39   PosN   :   4  
##                            NridgHt: 166   RRAe   :  28   RRNn   :   2  
##                            (Other):1428   (Other):  35   (Other):   2  
##    BldgType      HouseStyle    OverallQual      OverallCond   
##  1Fam  :2425   1Story :1471   Min.   : 1.000   Min.   :1.000  
##  2fmCon:  62   2Story : 872   1st Qu.: 5.000   1st Qu.:5.000  
##  Duplex: 109   1.5Fin : 314   Median : 6.000   Median :5.000  
##  Twnhs :  96   SLvl   : 128   Mean   : 6.089   Mean   :5.565  
##  TwnhsE: 227   SFoyer :  83   3rd Qu.: 7.000   3rd Qu.:6.000  
##                2.5Unf :  24   Max.   :10.000   Max.   :9.000  
##                (Other):  27                                   
##    YearBuilt     YearRemodAdd    RoofStyle       RoofMatl   
##  Min.   :1872   Min.   :1950   Flat   :  20   CompShg:2876  
##  1st Qu.:1954   1st Qu.:1965   Gable  :2310   Tar&Grv:  23  
##  Median :1973   Median :1993   Gambrel:  22   WdShake:   9  
##  Mean   :1971   Mean   :1984   Hip    : 551   WdShngl:   7  
##  3rd Qu.:2001   3rd Qu.:2004   Mansard:  11   ClyTile:   1  
##  Max.   :2010   Max.   :2010   Shed   :   5   Membran:   1  
##                                               (Other):   2  
##   Exterior1st    Exterior2nd     MasVnrType     MasVnrArea     ExterQual
##  VinylSd:1025   VinylSd:1014   BrkCmn :  25   Min.   :   0.0   Ex: 107  
##  MetalSd: 450   MetalSd: 447   BrkFace: 879   1st Qu.:   0.0   Fa:  35  
##  HdBoard: 442   HdBoard: 406   None   :1742   Median :   0.0   Gd: 979  
##  Wd Sdng: 411   Wd Sdng: 391   Stone  : 249   Mean   : 102.2   TA:1798  
##  Plywood: 221   Plywood: 270   NA's   :  24   3rd Qu.: 164.0            
##  (Other): 369   (Other): 390                  Max.   :1600.0            
##  NA's   :   1   NA's   :   1                  NA's   :23                
##  ExterCond  Foundation   BsmtQual    BsmtCond    BsmtExposure BsmtFinType1
##  Ex:  12   BrkTil: 311   Ex  : 258   Fa  : 104   Av  : 418    ALQ :429    
##  Fa:  67   CBlock:1235   Fa  :  88   Gd  : 122   Gd  : 276    BLQ :269    
##  Gd: 299   PConc :1308   Gd  :1209   Po  :   5   Mn  : 239    GLQ :849    
##  Po:   3   Slab  :  49   TA  :1283   TA  :2606   No  :1904    LwQ :154    
##  TA:2538   Stone :  11   NA's:  81   NA's:  82   NA's:  82    Rec :288    
##            Wood  :   5                                        Unf :851    
##                                                               NA's: 79    
##    BsmtFinSF1     BsmtFinType2   BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   ALQ :  52    Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   BLQ :  68    1st Qu.:   0.00   1st Qu.: 220.0  
##  Median : 368.5   GLQ :  34    Median :   0.00   Median : 467.0  
##  Mean   : 441.4   LwQ :  87    Mean   :  49.58   Mean   : 560.8  
##  3rd Qu.: 733.0   Rec : 105    3rd Qu.:   0.00   3rd Qu.: 805.5  
##  Max.   :5644.0   Unf :2493    Max.   :1526.00   Max.   :2336.0  
##  NA's   :1        NA's:  80    NA's   :1         NA's   :1       
##   TotalBsmtSF      Heating     HeatingQC CentralAir Electrical  
##  Min.   :   0.0   Floor:   1   Ex:1493   N: 196     FuseA: 188  
##  1st Qu.: 793.0   GasA :2874   Fa:  92   Y:2723     FuseF:  50  
##  Median : 989.5   GasW :  27   Gd: 474              FuseP:   8  
##  Mean   :1051.8   Grav :   9   Po:   3              Mix  :   1  
##  3rd Qu.:1302.0   OthW :   2   TA: 857              SBrkr:2671  
##  Max.   :6110.0   Wall :   6                        NA's :   1  
##  NA's   :1                                                      
##    X1stFlrSF      X2ndFlrSF       LowQualFinSF        GrLivArea   
##  Min.   : 334   Min.   :   0.0   Min.   :   0.000   Min.   : 334  
##  1st Qu.: 876   1st Qu.:   0.0   1st Qu.:   0.000   1st Qu.:1126  
##  Median :1082   Median :   0.0   Median :   0.000   Median :1444  
##  Mean   :1160   Mean   : 336.5   Mean   :   4.694   Mean   :1501  
##  3rd Qu.:1388   3rd Qu.: 704.0   3rd Qu.:   0.000   3rd Qu.:1744  
##  Max.   :5095   Max.   :2065.0   Max.   :1064.000   Max.   :5642  
##                                                                   
##   BsmtFullBath     BsmtHalfBath        FullBath        HalfBath     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   Median :2.000   Median :0.0000  
##  Mean   :0.4299   Mean   :0.06136   Mean   :1.568   Mean   :0.3803  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :3.0000   Max.   :2.00000   Max.   :4.000   Max.   :2.0000  
##  NA's   :2        NA's   :2                                         
##   BedroomAbvGr   KitchenAbvGr   KitchenQual  TotRmsAbvGrd   
##  Min.   :0.00   Min.   :0.000   Ex  : 205   Min.   : 2.000  
##  1st Qu.:2.00   1st Qu.:1.000   Fa  :  70   1st Qu.: 5.000  
##  Median :3.00   Median :1.000   Gd  :1151   Median : 6.000  
##  Mean   :2.86   Mean   :1.045   TA  :1492   Mean   : 6.452  
##  3rd Qu.:3.00   3rd Qu.:1.000   NA's:   1   3rd Qu.: 7.000  
##  Max.   :8.00   Max.   :3.000               Max.   :15.000  
##                                                             
##    Functional     Fireplaces     FireplaceQu   GarageType    GarageYrBlt  
##  Typ    :2717   Min.   :0.0000   Ex  :  43   2Types :  23   Min.   :1895  
##  Min2   :  70   1st Qu.:0.0000   Fa  :  74   Attchd :1723   1st Qu.:1960  
##  Min1   :  65   Median :1.0000   Gd  : 744   Basment:  36   Median :1979  
##  Mod    :  35   Mean   :0.5971   Po  :  46   BuiltIn: 186   Mean   :1978  
##  Maj1   :  19   3rd Qu.:1.0000   TA  : 592   CarPort:  15   3rd Qu.:2002  
##  (Other):  11   Max.   :4.0000   NA's:1420   Detchd : 779   Max.   :2207  
##  NA's   :   2                                NA's   : 157   NA's   :159   
##  GarageFinish   GarageCars      GarageArea     GarageQual  GarageCond 
##  Fin : 719    Min.   :0.000   Min.   :   0.0   Ex  :   3   Ex  :   3  
##  RFn : 811    1st Qu.:1.000   1st Qu.: 320.0   Fa  : 124   Fa  :  74  
##  Unf :1230    Median :2.000   Median : 480.0   Gd  :  24   Gd  :  15  
##  NA's: 159    Mean   :1.767   Mean   : 472.9   Po  :   5   Po  :  14  
##               3rd Qu.:2.000   3rd Qu.: 576.0   TA  :2604   TA  :2654  
##               Max.   :5.000   Max.   :1488.0   NA's: 159   NA's: 159  
##               NA's   :1       NA's   :1                               
##  PavedDrive   WoodDeckSF       OpenPorchSF     EnclosedPorch   
##  N: 216     Min.   :   0.00   Min.   :  0.00   Min.   :   0.0  
##  P:  62     1st Qu.:   0.00   1st Qu.:  0.00   1st Qu.:   0.0  
##  Y:2641     Median :   0.00   Median : 26.00   Median :   0.0  
##             Mean   :  93.71   Mean   : 47.49   Mean   :  23.1  
##             3rd Qu.: 168.00   3rd Qu.: 70.00   3rd Qu.:   0.0  
##             Max.   :1424.00   Max.   :742.00   Max.   :1012.0  
##                                                                
##    X3SsnPorch       ScreenPorch        PoolArea        PoolQC    
##  Min.   :  0.000   Min.   :  0.00   Min.   :  0.000   Ex  :   4  
##  1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000   Fa  :   2  
##  Median :  0.000   Median :  0.00   Median :  0.000   Gd  :   4  
##  Mean   :  2.602   Mean   : 16.06   Mean   :  2.252   NA's:2909  
##  3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.000              
##  Max.   :508.000   Max.   :576.00   Max.   :800.000              
##                                                                  
##    Fence      MiscFeature    MiscVal             MoSold      
##  GdPrv: 118   Gar2:   5   Min.   :    0.00   Min.   : 1.000  
##  GdWo : 112   Othr:   4   1st Qu.:    0.00   1st Qu.: 4.000  
##  MnPrv: 329   Shed:  95   Median :    0.00   Median : 6.000  
##  MnWw :  12   TenC:   1   Mean   :   50.83   Mean   : 6.213  
##  NA's :2348   NA's:2814   3rd Qu.:    0.00   3rd Qu.: 8.000  
##                           Max.   :17000.00   Max.   :12.000  
##                                                              
##      YrSold        SaleType    SaleCondition    SalePrice     
##  Min.   :2006   WD     :2525   Abnorml: 190   Min.   : 34900  
##  1st Qu.:2007   New    : 239   AdjLand:  12   1st Qu.:129975  
##  Median :2008   COD    :  87   Alloca :  24   Median :163000  
##  Mean   :2008   ConLD  :  26   Family :  46   Mean   :180921  
##  3rd Qu.:2009   CWD    :  12   Normal :2402   3rd Qu.:214000  
##  Max.   :2010   (Other):  29   Partial: 245   Max.   :755000  
##                 NA's   :   1                  NA's   :1459

Sale Price Density

#Density of the observed sale prices; rows without a SalePrice are left out.
#Vertical lines mark the mean (red) and the median (blue).
ggplot(data = dat %>% filter(!is.na(SalePrice)),
       mapping = aes(x = SalePrice)) +
  geom_density(fill = "green", alpha = 0.3) +
  geom_vline(xintercept = mean(dat$SalePrice, na.rm = TRUE),
             show.legend = TRUE, color = "red") +
  geom_vline(xintercept = median(dat$SalePrice, na.rm = TRUE),
             show.legend = TRUE, color = "blue") +
  labs(title = "Density for Sales Price", x = "Sale Price", y = "Density")

#The distribution is strongly right-skewed: the mean (red) lies above the median (blue)

summary(dat$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   34900  129975  163000  180921  214000  755000    1459

Dealing with NA values

#Count the missing values in every column (one-row data frame)
NAtable <- dat %>%
  summarise_all(list(~sum(is.na(.)))) %>%
  data.frame()

#keep only the columns with at least one NA, then drop the response,
#whose NAs are the test-set rows that were filled in on purpose
NAtable <- NAtable[, which(NAtable > 0)] %>%
  subset(select = -c(SalePrice))
dim(NAtable)
## [1]  1 34
#In total, 34 predictors have at least one missing value (NA)
NAtable <- NAtable[order(NAtable, decreasing=TRUE)]

NAtable
##   PoolQC MiscFeature Alley Fence FireplaceQu LotFrontage GarageYrBlt
## 1   2909        2814  2721  2348        1420         486         159
##   GarageFinish GarageQual GarageCond GarageType BsmtCond BsmtExposure
## 1          159        159        159        157       82           82
##   BsmtQual BsmtFinType2 BsmtFinType1 MasVnrType MasVnrArea MSZoning
## 1       81           80           79         24         23        4
##   Utilities BsmtFullBath BsmtHalfBath Functional Exterior1st Exterior2nd
## 1         2            2            2          2           1           1
##   BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Electrical KitchenQual
## 1          1          1         1           1          1           1
##   GarageCars GarageArea SaleType
## 1          1          1        1
#notice 
#Alley has 2721 NA
#FireplaceQu 1420 NA
#PoolQC 2909 NA
#Fence 2348 NA
#MiscFeature 2814 NA
#out of total 2919 observations

#I'm going to deal with those predictors that have NA values first

Alley NA values

summary(as.factor(dat$Alley))
## Grvl Pave NA's 
##  120   78 2721
#Alley: Type of alley access to property
#Grvl : Gravel
#Pave : Paved
#NA : no alley access

#NA means the property has no alley access; spell out the two abbreviations as well
dat$Alley <- with(dat,
                  ifelse(is.na(Alley), "no alley",
                         ifelse(Alley == "Grvl", "Gravel",
                                ifelse(Alley == "Pave", "Paved", Alley))))

summary(as.factor(dat$Alley))
##   Gravel no alley    Paved 
##      120     2721       78

FireplaceQu NA values

summary(as.factor(dat$FireplaceQu))
##   Ex   Fa   Gd   Po   TA NA's 
##   43   74  744   46  592 1420
#FireplaceQu: Fireplace quality
#Ex : Excellent
#Fa : Fair
#Gd : Good
#Po : Poor
#TA : Average
#NA : no fire place

dat$FireplaceQu[is.na(dat$FireplaceQu)] <- "no fireplace"


summary(as.factor(dat$FireplaceQu))
##           Ex           Fa           Gd no fireplace           Po 
##           43           74          744         1420           46 
##           TA 
##          592

PoolQC NA values

summary(as.factor(dat$PoolQC))
##   Ex   Fa   Gd NA's 
##    4    2    4 2909
#PoolQC: Pool quality
#NA : no pool

dat$PoolQC[is.na(dat$PoolQC)] <- "no pool"

summary(as.factor(dat$PoolQC))
##      Ex      Fa      Gd no pool 
##       4       2       4    2909

Fence NA values

summary(as.factor(dat$Fence))
## GdPrv  GdWo MnPrv  MnWw  NA's 
##   118   112   329    12  2348
#Fence: Fence quality
#NA : no fence

dat$Fence[is.na(dat$Fence)] <- "no fence"
summary(as.factor(dat$Fence))
##    GdPrv     GdWo    MnPrv     MnWw no fence 
##      118      112      329       12     2348

MiscFeature NA values

summary(as.factor(dat$MiscFeature))
## Gar2 Othr Shed TenC NA's 
##    5    4   95    1 2814
#MiscFeature: Miscellaneous feature not covered in other categories
#NA : None

dat$MiscFeature[is.na(dat$MiscFeature)] <- "None"
summary(as.factor(dat$MiscFeature))
## Gar2 None Othr Shed TenC 
##    5 2814    4   95    1

Creating Functions

#Numeric variable vs. factor variables
#Find the factor variables that are significantly related to a numeric variable (lm - anova),
#remove the insignificant factor variables,
#then compute variable importance with a random forest using only the significant variables

#This function supports filling the NA values of a numeric variable:
#it identifies the most significant factor variables,
#whose group means can then be used to impute the missing values

#Rank the factor predictors of a numeric variable by random forest importance.
#
#Steps:
# 1. collect every factor/character column of `data` other than `test.var`
# 2. fit an anova of `test.var` on all of them and keep the terms with p < 0.05
# 3. fit a random forest of `test.var` on the kept terms, using complete rows only
#
#Arguments:
# test.var : name (character string) of the numeric response column in `data`
# data     : data frame holding `test.var` and the candidate factor columns
#
#Returns the importance() matrix of the fitted random forest.
#Stops with an error when no factor variable is significant at the 0.05 level.
relationship.test.num <- function(test.var, data){

  #character columns are treated as factors throughout
  data <- data %>% mutate_if(is.character, as.factor)

  #only selecting factor variables (the response itself is excluded)
  variables <- data[, !colnames(data) %in% c(test.var), drop = FALSE] %>%
    select_if(is.factor) %>%
    colnames

  #creating formula for anova
  formula <- as.formula(paste(test.var, "~", paste(variables, collapse = "+")))

  #anova is fitted on the data passed in, so the caller's row and column
  #filtering is respected (no dependence on a global data frame)
  aov.fit <- aov(formula, data)
  aov.table <- summary(aov.fit)[[1]]

  #term names in the anova table are padded with spaces; strip them before matching.
  #which() drops the Residuals row, whose p value is NA
  significant <- gsub(" ", "", rownames(aov.table))[which(aov.table[["Pr(>F)"]] < 0.05)]

  #keep only the variables with p values below 0.05
  variables1 <- variables[variables %in% significant]

  if(length(variables1) == 0){
    stop("no factor variable is significant at the 0.05 level for ", test.var,
         call. = FALSE)
  }

  #formula with new variables
  formula1 <- as.formula(paste(test.var, "~", paste(variables1, collapse = "+")))

  #a quick random forest for variable importance, test.var as response variable
  a <- randomForest(formula1, data %>% filter(complete.cases(.)))

  return(importance(a))

}

#The function returns the variable importance



#This function is created to see the relationship between different types of variables

#For factor vs factor : 
#chi-squared test of independence
#Null H0 = the two variables are independent
#(the observed counts do not differ significantly from the expected counts)

#For factor vs numeric :
#one-way anova test, the same kind of test used in relationship.test.num above
#Null H0 = all group means are equal
#(a small p value indicates an association between the factor and the numeric variable)

#For numeric vs numeric :
#correlation : if the absolute correlation value is greater than 0.5, 
#it is generally considered a high correlation

#Fill a pairwise relationship table for the given variables.
#
#Arguments:
# variables  : character vector of column names in `data` to compare pairwise
# dummy.data : data frame with a `cols` column naming the row variable and
#              one column per variable; cell [cols == i, j] receives the result
# data       : data frame holding the variables
#
#For each pair the cell is set, rounded to 3 digits, to
# factor  vs factor  : p value of chisq.test (simulated p value)
# factor  vs numeric : p value of a one-way anova
# numeric vs numeric : correlation
#Pairs of any other type are left untouched.
#
#Returns the filled `dummy.data`.
relationship.test <- function(variables, dummy.data, data){

  for(row.var in variables){
    x <- data[,row.var]

    for(col.var in variables){
      y <- data[,col.var]

      if(is.factor(x) && is.factor(y)){
        #factor vs factor : p value from chisq test
        value <- chisq.test(x, y, simulate.p.value = TRUE)$p.value
      } else if(is.factor(x) && is.numeric(y)){
        #factor vs numeric : p value from anova, numeric variable as response
        value <- summary(aov(y ~ x))[[1]][["Pr(>F)"]][[1]]
      } else if(is.numeric(x) && is.factor(y)){
        #numeric vs factor : same anova with the roles swapped
        value <- summary(aov(x ~ y))[[1]][["Pr(>F)"]][[1]]
      } else if(is.numeric(x) && is.numeric(y)){
        #numeric vs numeric : correlation value
        value <- cor(x, y)
      } else {
        #no test defined for this combination of types
        next
      }

      dummy.data[dummy.data$cols == row.var, col.var] <- round(value, 3)
    }
  }

  return(dummy.data)
}
#returns a table for the relationship between each variables



#Prop table
#This function is created to see the proportion of each factor levels
#Proportion of observations in each level of a variable.
#
#Arguments:
# var  : name (character string) of the column to tabulate
# data : data frame containing `var`; defaults to the combined data set `dat`,
#        so existing calls of the form prop.func("x") keep working
#
#Returns a one-column matrix of proportions, one row per level, summing to 1.
prop.func <- function(var, data = dat){
  return(prop.table(matrix(table(data[[var]]))))
}



#Year to decade
#This function converts the year to the decade of year
#Round a year down to the first year of its decade (e.g. 1987 -> 1980).
# year : numeric vector of years
floor_decade <- function(year){
  years.into.decade <- year %% 10
  year - years.into.decade
}



#This function shows what a variable looks like and
#how the shape of its distribution differs between the train and the test set
#Compare the distribution of one variable between the train and the test rows.
#
#Rows with a non-missing SalePrice are treated as the train set and
#rows with a missing SalePrice as the test set.
#
#Arguments:
# var  : name (character string) of the column to plot
# data : data frame containing `var` and `SalePrice`
#
#Returns the result of grid.arrange() combining the train and the test plot:
#bar graphs for character/factor columns, density graphs for numeric columns.
#Columns of any other type produce no plot and NULL is returned invisibly.
train.test.graph <- function(var, data){

  #inspect the column of the data that was passed in, not a global data frame;
  #the is.*() tests stay scalar even when a column has more than one class
  #(e.g. ordered factors)
  values <- data[,var]
  var1 <- values[!is.na(data$SalePrice)]
  var2 <- values[is.na(data$SalePrice)]

  #for categorical variables, bar graphs
  if(is.character(values) || is.factor(values)){
    #train
    fac.train <- data %>% filter(!is.na(SalePrice)) %>%
      ggplot(aes(x=var1, fill=var1)) + geom_bar() + labs(title = "Train", x = var)
    #test
    fac.test <- data %>% filter(is.na(SalePrice)) %>%
      ggplot(aes(x=var2, fill=var2)) + geom_bar() + labs(title = "Test", x = var)

    return(grid.arrange(fac.train, fac.test))
  }

  #for continuous variables, density graphs
  if(is.numeric(values)){
    #train
    num.train <- data %>% filter(!is.na(SalePrice)) %>%
      ggplot(aes(x=var1)) + geom_density(fill="blue")  + labs(title = "Train", x = var)
    #test
    num.test <- data %>% filter(is.na(SalePrice)) %>%
      ggplot(aes(x=var2)) + geom_density(fill="blue")  + labs(title = "Test", x = var)

    return(grid.arrange(num.train, num.test))
  }

  #no plot is defined for other column types
  invisible(NULL)
}



#Conversion from ordinal to continuous
#assigns a numeric value to each factor level
#Map the levels of an ordinal variable to numbers.
#
#Arguments:
# var : vector (factor or character) of ordinal levels
# set : named character vector passed to revalue(), old level = new value
#
#Returns the recoded values converted with as.numeric().
ordTonum <- function(var,set){
  recoded <- revalue(as.character(var), set)
  as.numeric(recoded)
}

LotFrontage NA values

#LotFrontage: Linear feet of street connected to property
dat$LotFrontage %>% summary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   59.00   68.00   69.31   80.00  313.00     486
dat %>% filter(!is.na(SalePrice)) %>% select(LotFrontage) %>% summary
##   LotFrontage    
##  Min.   : 21.00  
##  1st Qu.: 59.00  
##  Median : 69.00  
##  Mean   : 70.05  
##  3rd Qu.: 80.00  
##  Max.   :313.00  
##  NA's   :259
dat %>% filter(is.na(SalePrice)) %>% select(LotFrontage) %>% summary
##   LotFrontage    
##  Min.   : 21.00  
##  1st Qu.: 58.00  
##  Median : 67.00  
##  Mean   : 68.58  
##  3rd Qu.: 80.00  
##  Max.   :200.00  
##  NA's   :227
train.test.graph("LotFrontage", dat %>% filter(!is.na(LotFrontage)))

#The graphs show that the distribution of LotFrontage has a different shape in the train and the test set


#Let's find the most important variable with LotFrontage
relationship.test.num("LotFrontage", dat %>% 
                        filter(!is.na(SalePrice) & !is.na(LotFrontage)) %>%
                        subset(select = -c(YearBuilt, YearRemodAdd, GarageYrBlt,
                                           Utilities, MoSold, YrSold)) %>% 
                        mutate_if(is.character, as.factor))
##              IncNodePurity
## MSSubClass      102957.084
## MSZoning         18688.208
## Street            1288.341
## Alley             2336.358
## LotShape         36044.329
## LandContour      10015.535
## LotConfig        39998.947
## Neighborhood     83381.971
## Condition1       28776.194
## Condition2        1258.002
## BldgType         63935.140
## RoofStyle        10438.802
## RoofMatl         35004.918
## Exterior2nd      30027.687
## MasVnrType        9889.772
## ExterQual         9524.076
## Foundation        9286.946
## BsmtQual         15087.529
## BsmtExposure     13849.877
## Functional        5040.911
## FireplaceQu      18048.082
## GarageType       27753.924
## PoolQC           19641.178
#MSSubClass is the most important variable for LotFrontage
#Neighborhood and BldgType have high variable importance as well

lotfront <- dat %>% filter(!is.na(LotFrontage)) %>%
  mutate_if(is.character, as.factor)

lotfront %>%
  ggplot(aes(x=MSSubClass, y=LotFrontage, fill=MSSubClass)) +
  geom_boxplot()

lotfront %>%
  group_by(MSSubClass) %>%
  summarise(count = n(),
            mean = mean(LotFrontage),
            median = median(LotFrontage),
            min = min(LotFrontage),
            max = max(LotFrontage))
## # A tibble: 15 x 6
##    MSSubClass count  mean median   min   max
##    <fct>      <int> <dbl>  <dbl> <int> <int>
##  1 20           894  77.7     75    36   313
##  2 30           126  61.0     60    30   153
##  3 40             5  53.4     55    40    62
##  4 45            18  55.6     55    40    85
##  5 50           262  63.2     60    40   152
##  6 60           442  79.0     75    41   313
##  7 70           117  64.2     60    34   144
##  8 75            21  74.5     65    35   174
##  9 80            85  78.6     78    37   140
## 10 85            32  73.4     72    50   150
## 11 90            92  70.8     70    33   120
## 12 120          150  44.7     43    22   135
## 13 160          116  26.9     24    21    75
## 14 180           16  26.2     21    21    35
## 15 190           57  68.2     60    33   195
lot.d1 <- data.frame(dat %>% filter(!is.na(LotFrontage)) %>% group_by(MSSubClass) %>% tally())

lot.d2 <- data.frame(dat %>% filter(is.na(LotFrontage)) %>% group_by(MSSubClass) %>% tally())

lot.d1$MSSubClass %in% lot.d2$MSSubClass
##  [1]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE
m1 <- data.frame(merge(lot.d1, lot.d2, by="MSSubClass", all=TRUE))
m1
##    MSSubClass n.x n.y
## 1          20 894 185
## 2          30 126  13
## 3          40   5   1
## 4          45  18  NA
## 5          50 262  25
## 6          60 442 133
## 7          70 117  11
## 8          75  21   2
## 9          80  85  33
## 10         85  32  16
## 11         90  92  17
## 12        120 150  32
## 13        150  NA   1
## 14        160 116  12
## 15        180  16   1
## 16        190  57   4
#LotArea..
dat %>% filter(!is.na(LotFrontage)) %>% 
  ggplot(aes(x=LotFrontage, y=log(LotArea+1))) + geom_jitter()

#quick randomforest

rf.lot <- randomForest(LotFrontage ~.,
             data = dat %>% 
               filter(complete.cases(.)) %>% 
               mutate_if(is.character, as.factor))
rf.lot
## 
## Call:
##  randomForest(formula = LotFrontage ~ ., data = dat %>% filter(complete.cases(.)) %>%      mutate_if(is.character, as.factor)) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 26
## 
##           Mean of squared residuals: 283.9752
##                     % Var explained: 52.68
importance(rf.lot) #LotArea , MSSubClass, Neighborhood, BldgType
##               IncNodePurity
## MSSubClass      63221.21440
## MSZoning         7110.98840
## LotArea        147552.10191
## Street            247.60762
## Alley             321.64063
## LotShape        15015.82871
## LandContour      4058.63681
## Utilities           0.00000
## LotConfig       28318.66367
## LandSlope         758.09504
## Neighborhood    44056.32037
## Condition1      16528.84729
## Condition2        219.83946
## BldgType        33432.62833
## HouseStyle       3597.11633
## OverallQual      2179.48377
## OverallCond      1896.54740
## YearBuilt        8937.02160
## YearRemodAdd     4279.36921
## RoofStyle        3710.89859
## RoofMatl        25083.18129
## Exterior1st      6453.50002
## Exterior2nd      7286.89080
## MasVnrType       1290.50907
## MasVnrArea       3201.87373
## ExterQual        1364.33006
## ExterCond         488.44810
## Foundation       2653.65731
## BsmtQual         1995.08026
## BsmtCond         1214.79830
## BsmtExposure     2485.26759
## BsmtFinType1     4164.56276
## BsmtFinSF1      12588.57465
## BsmtFinType2     1909.18142
## BsmtFinSF2       1169.24898
## BsmtUnfSF        6205.05533
## TotalBsmtSF     17655.73572
## Heating           264.49876
## HeatingQC        1190.84288
## CentralAir        157.91295
## Electrical        339.69516
## X1stFlrSF       19299.26771
## X2ndFlrSF        4202.86188
## LowQualFinSF      194.81284
## GrLivArea       16621.56546
## BsmtFullBath     1104.19854
## BsmtHalfBath      194.15749
## FullBath          887.72407
## HalfBath          664.69432
## BedroomAbvGr     1922.84023
## KitchenAbvGr      195.72842
## KitchenQual      1835.42205
## TotRmsAbvGrd     3516.02926
## Functional       1546.46216
## Fireplaces       1471.11023
## FireplaceQu      3300.04164
## GarageType      10016.16905
## GarageYrBlt      5104.46828
## GarageFinish     3057.35464
## GarageCars       1966.59184
## GarageArea      18278.94730
## GarageQual        679.54179
## GarageCond        125.69002
## PavedDrive        712.77794
## WoodDeckSF       3797.57959
## OpenPorchSF      5094.69892
## EnclosedPorch    4377.24833
## X3SsnPorch        500.96121
## ScreenPorch       729.12273
## PoolArea         1393.28535
## PoolQC           8802.51986
## Fence            1658.31316
## MiscFeature        63.34413
## MiscVal            75.68610
## MoSold           4001.60073
## YrSold           2223.78872
## SaleType          781.03235
## SaleCondition    2647.51369
## SalePrice       12812.48044
#creating new train set only for predicting lotfrontage 
#rows with a known LotFrontage: used to fit the imputation model.
#Only the response and the three chosen predictors are kept;
#factor levels are built after filtering, from the rows that remain
lot.dat <- dat %>%
  select(LotFrontage, LotArea, MSSubClass, Neighborhood) %>%
  filter(!is.na(LotFrontage)) %>%
  mutate_if(is.character, as.factor)

#rows with a missing LotFrontage: the values to be predicted
lot.dat1 <- dat %>%
  select(LotFrontage, LotArea, MSSubClass, Neighborhood) %>%
  filter(is.na(LotFrontage)) %>%
  mutate_if(is.character, as.factor)

lot.dat1$Neighborhood %>% levels()
##  [1] "Blmngtn" "BrkSide" "ClearCr" "CollgCr" "Crawfor" "Edwards" "Gilbert"
##  [8] "IDOTRR"  "MeadowV" "Mitchel" "NAmes"   "NoRidge" "NPkVill" "NridgHt"
## [15] "NWAmes"  "OldTown" "Sawyer"  "SawyerW" "Somerst" "StoneBr" "SWISU"  
## [22] "Timber"  "Veenker"
levels(lot.dat$Neighborhood) %in% levels(lot.dat1$Neighborhood)
##  [1]  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [23]  TRUE  TRUE  TRUE
#some levels of Neighborhood do not appear among the rows with missing LotFrontage

#keep only the neighborhoods that also occur in the rows to be predicted,
#then drop the unused levels so both data sets share the same factor levels
lot.dat <- subset(lot.dat, Neighborhood %in% levels(lot.dat1$Neighborhood))
lot.dat$Neighborhood <- droplevels(lot.dat$Neighborhood)

#random forest with LotArea, MSSubClass, and Neighborhood
rf.lot <- randomForest(LotFrontage ~ LotArea + MSSubClass + Neighborhood,
                       data = lot.dat)

rf.lot
## 
## Call:
##  randomForest(formula = LotFrontage ~ LotArea + MSSubClass + Neighborhood,      data = lot.dat) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 249.1706
##                     % Var explained: 51.82
lot.dat %>% str
## 'data.frame':    2393 obs. of  4 variables:
##  $ LotFrontage : int  65 80 68 60 84 85 75 51 50 70 ...
##  $ LotArea     : int  8450 9600 11250 9550 14260 14115 10084 6120 7420 11200 ...
##  $ MSSubClass  : Factor w/ 16 levels "20","30","40",..: 6 1 6 7 6 5 1 5 16 1 ...
##  $ Neighborhood: Factor w/ 23 levels "Blmngtn","BrkSide",..: 4 23 4 5 12 10 19 16 2 17 ...
lot.dat1 %>% str
## 'data.frame':    486 obs. of  4 variables:
##  $ LotFrontage : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ LotArea     : int  10382 12968 10920 11241 8246 8544 9180 9200 13869 9375 ...
##  $ MSSubClass  : Factor w/ 16 levels "20","30","40",..: 6 1 1 1 1 1 10 1 6 6 ...
##  $ Neighborhood: Factor w/ 23 levels "Blmngtn","BrkSide",..: 15 17 11 11 17 17 18 4 7 4 ...
# Predict LotFrontage for the rows held in lot.dat1.
rf.pred <- predict(rf.lot, lot.dat1)

# Replace the NA values in dat$LotFrontage with the random forest predictions.
# The assignment is positional, so the predictions must line up one-to-one with
# the NA rows of dat; a length mismatch would otherwise be recycled with only a
# warning. NOTE(review): presumably lot.dat1 was built from exactly these rows,
# in the same order -- confirm where lot.dat1 is created.
lot_frontage_na <- is.na(dat$LotFrontage)
stopifnot(
  length(rf.pred) == sum(lot_frontage_na),
  !anyNA(rf.pred)
)
dat$LotFrontage[lot_frontage_na] <- rf.pred

summary(dat$LotFrontage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   60.00   70.00   70.06   80.00  313.00
#no NA value
# Correlation of log(SalePrice) with log(LotArea) over the rows that have a SalePrice
with(dat[!is.na(dat$SalePrice), ], cor(log(SalePrice), log(LotArea)))
## [1] 0.3999177
# Correlation of log(SalePrice) with log(LotFrontage) over the rows that have a SalePrice
with(dat[!is.na(dat$SalePrice), ], cor(log(SalePrice), log(LotFrontage)))
## [1] 0.3542928
# train.test.graph() is defined elsewhere in this file. LotFrontage has no NA
# left at this point (see the summary above), so the filter is only a safeguard.
train.test.graph("LotFrontage", dat %>% filter(!is.na(LotFrontage)))

Garage Factor variables NA values

# Summary of every Garage* column; character columns are shown as factors
dat %>%
  select(contains("Garage")) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##    GarageType    GarageYrBlt   GarageFinish   GarageCars   
##  2Types :  23   Min.   :1895   Fin : 719    Min.   :0.000  
##  Attchd :1723   1st Qu.:1960   RFn : 811    1st Qu.:1.000  
##  Basment:  36   Median :1979   Unf :1230    Median :2.000  
##  BuiltIn: 186   Mean   :1978   NA's: 159    Mean   :1.767  
##  CarPort:  15   3rd Qu.:2002                3rd Qu.:2.000  
##  Detchd : 779   Max.   :2207                Max.   :5.000  
##  NA's   : 157   NA's   :159                 NA's   :1      
##    GarageArea     GarageQual  GarageCond 
##  Min.   :   0.0   Ex  :   3   Ex  :   3  
##  1st Qu.: 320.0   Fa  : 124   Fa  :  74  
##  Median : 480.0   Gd  :  24   Gd  :  15  
##  Mean   : 472.9   Po  :   5   Po  :  14  
##  3rd Qu.: 576.0   TA  :2604   TA  :2654  
##  Max.   :1488.0   NA's: 159   NA's: 159  
##  NA's   :1
#Factor: GarageType / GarageYrBlt / GarageFinish / GarageQual / GarageCond
#Numeric: GarageCars / GarageArea

#The number of NA values differs across the Garage variables
#First fill the NA values of the Garage factor variable with the fewest NA values, which is GarageType

# Names of the Garage* columns that are categorical, i.e. stored as character
# or factor (character columns are the ones treated as factors in the summaries)
garage.fac.var <- dat %>%
  select(contains("Garage")) %>%
  select_if(function(col) is.character(col) || is.factor(col)) %>%
  colnames()

# Garage columns of the rows where GarageArea is missing
dat %>%
  select(contains("Garage")) %>%
  filter(is.na(GarageArea)) %>%
  head()
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Detchd          NA         <NA>         NA         NA       <NA>
##   GarageCond
## 1       <NA>
# Garage columns of the rows where GarageCars is missing
dat %>%
  select(contains("Garage")) %>%
  filter(is.na(GarageCars)) %>%
  head()
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Detchd          NA         <NA>         NA         NA       <NA>
##   GarageCond
## 1       <NA>
# Rows that record a GarageType but have no GarageQual
dat %>%
  select(contains("Garage")) %>%
  filter(!is.na(GarageType), is.na(GarageQual)) %>%
  head()
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Detchd          NA         <NA>          1        360       <NA>
## 2     Detchd          NA         <NA>         NA         NA       <NA>
##   GarageCond
## 1       <NA>
## 2       <NA>
# Rows that record a GarageType but have no GarageCond
dat %>%
  select(contains("Garage")) %>%
  filter(!is.na(GarageType), is.na(GarageCond)) %>%
  head()
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Detchd          NA         <NA>          1        360       <NA>
## 2     Detchd          NA         <NA>         NA         NA       <NA>
##   GarageCond
## 1       <NA>
## 2       <NA>
# GarageType "Detchd" = detached from home (might be street parking)

# Show the GarageType entries that are still missing
dat$GarageType[which(is.na(dat$GarageType))]
##   [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [24] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [47] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [70] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##  [93] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [116] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [139] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
# Rows without a recorded GarageType are labelled "no garage"
dat$GarageType <- replace(dat$GarageType, is.na(dat$GarageType), "no garage")

# Profile of the detached garages that do have a quality rating
dat %>%
  select(contains("Garage")) %>%
  filter(GarageType == "Detchd", !is.na(GarageQual)) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##   GarageType   GarageYrBlt   GarageFinish   GarageCars      GarageArea    
##  Detchd:777   Min.   :1895   Fin: 24      Min.   :1.000   Min.   : 100.0  
##               1st Qu.:1939   RFn: 34      1st Qu.:1.000   1st Qu.: 280.0  
##               Median :1962   Unf:719      Median :2.000   Median : 400.0  
##               Mean   :1961                Mean   :1.548   Mean   : 419.6  
##               3rd Qu.:1981                3rd Qu.:2.000   3rd Qu.: 528.0  
##               Max.   :2009                Max.   :5.000   Max.   :1488.0  
##  GarageQual GarageCond
##  Ex:  3     Ex:  3    
##  Fa: 97     Fa: 67    
##  Gd:  5     Gd:  3    
##  Po:  5     Po: 12    
##  TA:667     TA:692    
## 
# Profile of the detached garages whose quality rating is missing
dat %>%
  select(contains("Garage")) %>%
  filter(GarageType == "Detchd", is.na(GarageQual)) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##   GarageType  GarageYrBlt  GarageFinish   GarageCars   GarageArea 
##  Detchd:2    Min.   : NA   NA's:2       Min.   :1    Min.   :360  
##              1st Qu.: NA                1st Qu.:1    1st Qu.:360  
##              Median : NA                Median :1    Median :360  
##              Mean   :NaN                Mean   :1    Mean   :360  
##              3rd Qu.: NA                3rd Qu.:1    3rd Qu.:360  
##              Max.   : NA                Max.   :1    Max.   :360  
##              NA's   :2                  NA's   :1    NA's   :1    
##  GarageQual GarageCond
##  NA's:2     NA's:2    
##                       
##                       
##                       
##                       
##                       
## 
# Detached garages that still miss details receive the usual "Detchd" values:
#   GarageYrBlt             -> median year of the detached garages
#   GarageFinish            -> "Unf"
#   GarageQual / GarageCond -> "TA"
is_detchd <- dat$GarageType == "Detchd"

detchd_year <- median(dat$GarageYrBlt[is_detchd], na.rm = TRUE)
dat$GarageYrBlt[is_detchd & is.na(dat$GarageYrBlt)] <- detchd_year

dat$GarageFinish[is_detchd & is.na(dat$GarageFinish)] <- "Unf"

for (rating_col in c("GarageQual", "GarageCond")) {
  dat[[rating_col]][is_detchd & is.na(dat[[rating_col]])] <- "TA"
}

# Detached garages that still lack GarageCars
dat %>%
  select(contains("Garage")) %>%
  filter(GarageType == "Detchd", is.na(GarageCars)) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##   GarageType  GarageYrBlt   GarageFinish   GarageCars    GarageArea 
##  Detchd:1    Min.   :1962   Unf:1        Min.   : NA   Min.   : NA  
##              1st Qu.:1962                1st Qu.: NA   1st Qu.: NA  
##              Median :1962                Median : NA   Median : NA  
##              Mean   :1962                Mean   :NaN   Mean   :NaN  
##              3rd Qu.:1962                3rd Qu.: NA   3rd Qu.: NA  
##              Max.   :1962                Max.   : NA   Max.   : NA  
##                                          NA's   :1     NA's   :1    
##  GarageQual GarageCond
##  TA:1       TA:1      
##                       
##                       
##                       
##                       
##                       
## 
# Detached garages with a recorded GarageCars value
dat %>%
  select(contains("Garage")) %>%
  filter(GarageType == "Detchd", !is.na(GarageCars)) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##   GarageType   GarageYrBlt   GarageFinish   GarageCars      GarageArea    
##  Detchd:778   Min.   :1895   Fin: 24      Min.   :1.000   Min.   : 100.0  
##               1st Qu.:1939   RFn: 34      1st Qu.:1.000   1st Qu.: 280.0  
##               Median :1962   Unf:720      Median :2.000   Median : 399.5  
##               Mean   :1961                Mean   :1.548   Mean   : 419.5  
##               3rd Qu.:1981                3rd Qu.:2.000   3rd Qu.: 528.0  
##               Max.   :2009                Max.   :5.000   Max.   :1488.0  
##  GarageQual GarageCond
##  Ex:  3     Ex:  3    
##  Fa: 97     Fa: 67    
##  Gd:  5     Gd:  3    
##  Po:  5     Po: 12    
##  TA:668     TA:693    
## 
# Missing GarageCars / GarageArea of detached garages: use the median of the
# detached garages for the respective column.
detchd_rows <- dat$GarageType == "Detchd"
for (size_col in c("GarageCars", "GarageArea")) {
  detchd_median <- median(dat[[size_col]][detchd_rows], na.rm = TRUE)
  dat[[size_col]][detchd_rows & is.na(dat[[size_col]])] <- detchd_median
}


# Everything that is still NA is treated as "no garage". GarageYrBlt is numeric,
# so 0 stands in for the label there.
dat$GarageYrBlt <- replace(dat$GarageYrBlt, is.na(dat$GarageYrBlt), 0)
for (label_col in c("GarageFinish", "GarageQual", "GarageCond")) {
  dat[[label_col]][is.na(dat[[label_col]])] <- "no garage"
}


# Re-check all Garage* columns
dat %>%
  select(contains("Garage")) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##      GarageType    GarageYrBlt      GarageFinish    GarageCars   
##  2Types   :  23   Min.   :   0   Fin      : 719   Min.   :0.000  
##  Attchd   :1723   1st Qu.:1957   no garage: 157   1st Qu.:1.000  
##  Basment  :  36   Median :1977   RFn      : 811   Median :2.000  
##  BuiltIn  : 186   Mean   :1872   Unf      :1232   Mean   :1.767  
##  CarPort  :  15   3rd Qu.:2001                    3rd Qu.:2.000  
##  Detchd   : 779   Max.   :2207                    Max.   :5.000  
##  no garage: 157                                                  
##    GarageArea         GarageQual       GarageCond  
##  Min.   :   0.0   Ex       :   3   Ex       :   3  
##  1st Qu.: 320.0   Fa       : 124   Fa       :  74  
##  Median : 480.0   Gd       :  24   Gd       :  15  
##  Mean   : 472.8   no garage: 157   no garage: 157  
##  3rd Qu.: 576.0   Po       :   5   Po       :  14  
##  Max.   :1488.0   TA       :2606   TA       :2656  
## 
#no NA value for Garage vars

Basement variables - factor variables

# Level counts of the categorical (character or factor) basement columns
dat %>%
  select(contains("Bsmt")) %>%
  select_if(function(col) is.character(col) || is.factor(col)) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##  BsmtQual    BsmtCond    BsmtExposure BsmtFinType1 BsmtFinType2
##  Ex  : 258   Fa  : 104   Av  : 418    ALQ :429     ALQ :  52   
##  Fa  :  88   Gd  : 122   Gd  : 276    BLQ :269     BLQ :  68   
##  Gd  :1209   Po  :   5   Mn  : 239    GLQ :849     GLQ :  34   
##  TA  :1283   TA  :2606   No  :1904    LwQ :154     LwQ :  87   
##  NA's:  81   NA's:  82   NA's:  82    Rec :288     Rec : 105   
##                                       Unf :851     Unf :2493   
##                                       NA's: 79     NA's:  80
# BsmtFinType1 has the fewest NA values among the basement factors, so its NAs
# are relabelled "no basement" first and used as the reference afterwards.
dat$BsmtFinType1 <- replace(dat$BsmtFinType1, is.na(dat$BsmtFinType1), "no basement")


# BsmtFinType2: rows that have a basement but no BsmtFinType2
dat %>%
  select(contains("Bsmt")) %>%
  filter(is.na(BsmtFinType2), BsmtFinType1 != "no basement")
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ       1124         <NA>
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath
## 1        479      1603        3206            1            0
# The affected row has BsmtQual = Gd, BsmtCond = TA and BsmtFinType1 = GLQ.
# Compare BsmtFinSF2 across the BsmtFinType2 groups of rows with that profile.
dat %>%
  select(contains("Bsmt")) %>%
  filter(BsmtQual == "Gd", BsmtCond == "TA", BsmtFinType1 == "GLQ") %>%
  group_by(BsmtFinType2) %>%
  summarise(
    count = n(),
    mean = mean(BsmtFinSF2)
  )
## # A tibble: 6 x 3
##   BsmtFinType2 count     mean
##   <chr>        <int>    <dbl>
## 1 <NA>             1 479     
## 2 ALQ             11 347.    
## 3 BLQ              5 296     
## 4 LwQ              8 178.    
## 5 Rec              7 294.    
## 6 Unf            503   0.0119
# The single row of that profile whose BsmtFinType2 is missing
dat %>%
  select(contains("Bsmt")) %>%
  filter(
    BsmtQual == "Gd",
    BsmtCond == "TA",
    BsmtFinType1 == "GLQ",
    is.na(BsmtFinType2)
  )
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ       1124         <NA>
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath
## 1        479      1603        3206            1            0
# Fill the missing BsmtFinType2 values:
#   * no basement (per BsmtFinType1) -> "no basement"
#   * basement present               -> "ALQ", the group whose mean BsmtFinSF2 is
#                                       closest to the affected row's value
# Vectorised masks replace the former row-by-row `1:nrow(dat)` loop, which
# iterated over c(1, 0) on an empty data frame and failed in `if (NA)`.
type2_missing <- is.na(dat$BsmtFinType2)
bsmt_absent <- dat$BsmtFinType1 == "no basement"
dat$BsmtFinType2[type2_missing & bsmt_absent] <- "no basement"
dat$BsmtFinType2[type2_missing & !bsmt_absent] <- "ALQ"

# "no basement" rows whose BsmtQual is still missing
dat %>%
  select(contains("Bsmt")) %>%
  filter(is.na(BsmtQual) & BsmtFinType1 == "no basement") %>%
  head()
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1     <NA>     <NA>         <NA>  no basement          0  no basement
## 2     <NA>     <NA>         <NA>  no basement          0  no basement
## 3     <NA>     <NA>         <NA>  no basement          0  no basement
## 4     <NA>     <NA>         <NA>  no basement          0  no basement
## 5     <NA>     <NA>         <NA>  no basement          0  no basement
## 6     <NA>     <NA>         <NA>  no basement          0  no basement
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath
## 1          0         0           0            0            0
## 2          0         0           0            0            0
## 3          0         0           0            0            0
## 4          0         0           0            0            0
## 5          0         0           0            0            0
## 6          0         0           0            0            0
# BsmtQual: fill the missing values.
#   * no basement (per BsmtFinType1)         -> "no basement"
#   * basement present and BsmtCond recorded -> reuse the BsmtCond rating
# The former loop ran both `if`s in sequence, so a "no basement" row with a
# recorded BsmtCond would have had its label overwritten; the masks below make
# the two cases mutually exclusive. %in% keeps the masks free of NA.
qual_missing <- is.na(dat$BsmtQual)
bsmt_absent <- dat$BsmtFinType1 %in% "no basement"
copy_cond <- qual_missing & !bsmt_absent & !is.na(dat$BsmtCond)
dat$BsmtQual[qual_missing & bsmt_absent] <- "no basement"
dat$BsmtQual[copy_cond] <- dat$BsmtCond[copy_cond]


# BsmtCond: profile of the rows where it is missing
dat %>%
  select(contains("Bsmt")) %>%
  filter(is.na(BsmtCond)) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##         BsmtQual  BsmtCond  BsmtExposure      BsmtFinType1
##  Gd         : 1   NA's:82   Av  : 1      ALQ        : 1   
##  no basement:79             Mn  : 1      BLQ        : 1   
##  TA         : 2             No  : 1      GLQ        : 1   
##                             NA's:79      no basement:79   
##                                                           
##                                                           
##                                                           
##    BsmtFinSF1           BsmtFinType2   BsmtFinSF2        BsmtUnfSF      
##  Min.   :   0.00   no basement:79    Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:   0.00   Rec        : 1    1st Qu.:  0.000   1st Qu.:  0.000  
##  Median :   0.00   Unf        : 2    Median :  0.000   Median :  0.000  
##  Mean   :  34.96                     Mean   :  4.716   Mean   :  4.123  
##  3rd Qu.:   0.00                     3rd Qu.:  0.000   3rd Qu.:  0.000  
##  Max.   :1044.00                     Max.   :382.000   Max.   :240.000  
##  NA's   :1                           NA's   :1         NA's   :1        
##   TotalBsmtSF      BsmtFullBath     BsmtHalfBath   
##  Min.   :   0.0   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:   0.0   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :   0.0   Median :0.0000   Median :0.0000  
##  Mean   :  43.8   Mean   :0.0125   Mean   :0.0125  
##  3rd Qu.:   0.0   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1426.0   Max.   :1.0000   Max.   :1.0000  
##  NA's   :1        NA's   :2        NA's   :2
# BsmtCond: a missing value takes over the BsmtQual entry of the same row. That
# is "no basement" where there is no basement and the quality rating otherwise,
# which is what both branches of the former row-by-row loop assigned. The
# vectorised copy also tolerates a BsmtQual that is itself still NA, where the
# loop's `if (dat$BsmtQual[i] == ...)` would have stopped with an error.
cond_missing <- is.na(dat$BsmtCond)
dat$BsmtCond[cond_missing] <- dat$BsmtQual[cond_missing]

summary(as.factor(dat$BsmtCond))
##          Fa          Gd no basement          Po          TA 
##         104         123          79           5        2608
# Level counts of BsmtQual after the fill
dat$BsmtQual %>%
  as.factor() %>%
  summary()
##          Ex          Fa          Gd no basement          TA 
##         258          89        1209          79        1284
# BsmtExposure: profile of the rows where it is missing
dat %>%
  select(contains("Bsmt")) %>%
  filter(is.na(BsmtExposure)) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##         BsmtQual         BsmtCond  BsmtExposure      BsmtFinType1
##  Gd         : 3   no basement:79   NA's:82      no basement:79   
##  no basement:79   TA         : 3                Unf        : 3   
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##    BsmtFinSF1      BsmtFinType2   BsmtFinSF2   BsmtUnfSF     
##  Min.   :0    no basement:79    Min.   :0    Min.   :   0.0  
##  1st Qu.:0    Unf        : 3    1st Qu.:0    1st Qu.:   0.0  
##  Median :0                      Median :0    Median :   0.0  
##  Mean   :0                      Mean   :0    Mean   :  40.2  
##  3rd Qu.:0                      3rd Qu.:0    3rd Qu.:   0.0  
##  Max.   :0                      Max.   :0    Max.   :1595.0  
##  NA's   :1                      NA's   :1    NA's   :1       
##   TotalBsmtSF      BsmtFullBath  BsmtHalfBath
##  Min.   :   0.0   Min.   :0     Min.   :0    
##  1st Qu.:   0.0   1st Qu.:0     1st Qu.:0    
##  Median :   0.0   Median :0     Median :0    
##  Mean   :  40.2   Mean   :0     Mean   :0    
##  3rd Qu.:   0.0   3rd Qu.:0     3rd Qu.:0    
##  Max.   :1595.0   Max.   :0     Max.   :0    
##  NA's   :1        NA's   :2     NA's   :2
# Distribution of BsmtExposure among the rows with BsmtQual "Gd"
dat %>%
  select(contains("Bsmt")) %>%
  filter(BsmtQual == "Gd") %>%
  group_by(BsmtExposure) %>%
  tally()
## # A tibble: 5 x 2
##   BsmtExposure     n
##   <chr>        <int>
## 1 <NA>             3
## 2 Av             261
## 3 Gd             139
## 4 Mn             112
## 5 No             694
# Fill the missing BsmtExposure values:
#   * no basement (per BsmtQual) -> "no basement"
#   * basement present           -> "No", the most frequent exposure in the
#                                   tally above
# Vectorised masks replace the former row-by-row `1:nrow(dat)` loop, which
# failed in `if (NA)` on an empty data frame or a missing BsmtQual.
exposure_missing <- is.na(dat$BsmtExposure)
bsmt_absent <- dat$BsmtQual == "no basement"
dat$BsmtExposure[exposure_missing & bsmt_absent] <- "no basement"
dat$BsmtExposure[exposure_missing & !bsmt_absent] <- "No"



# Re-check the categorical basement columns
dat %>%
  select(starts_with("Bsmt")) %>%
  select_if(is.character) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##         BsmtQual           BsmtCond         BsmtExposure 
##  Ex         : 258   Fa         : 104   Av         : 418  
##  Fa         :  89   Gd         : 123   Gd         : 276  
##  Gd         :1209   no basement:  79   Mn         : 239  
##  no basement:  79   Po         :   5   No         :1907  
##  TA         :1284   TA         :2608   no basement:  79  
##                                                          
##                                                          
##       BsmtFinType1      BsmtFinType2 
##  ALQ        :429   ALQ        :  53  
##  BLQ        :269   BLQ        :  68  
##  GLQ        :849   GLQ        :  34  
##  LwQ        :154   LwQ        :  87  
##  no basement: 79   no basement:  79  
##  Rec        :288   Rec        : 105  
##  Unf        :851   Unf        :2493
#No Na values

Basement variables - numeric variables

# Summary of the numeric basement columns
dat %>%
  select(contains("Bsmt")) %>%
  select_if(is.numeric) %>%
  summary()
##    BsmtFinSF1       BsmtFinSF2        BsmtUnfSF       TotalBsmtSF    
##  Min.   :   0.0   Min.   :   0.00   Min.   :   0.0   Min.   :   0.0  
##  1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.: 220.0   1st Qu.: 793.0  
##  Median : 368.5   Median :   0.00   Median : 467.0   Median : 989.5  
##  Mean   : 441.4   Mean   :  49.58   Mean   : 560.8   Mean   :1051.8  
##  3rd Qu.: 733.0   3rd Qu.:   0.00   3rd Qu.: 805.5   3rd Qu.:1302.0  
##  Max.   :5644.0   Max.   :1526.00   Max.   :2336.0   Max.   :6110.0  
##  NA's   :1        NA's   :1         NA's   :1        NA's   :1       
##   BsmtFullBath     BsmtHalfBath    
##  Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000  
##  Mean   :0.4299   Mean   :0.06136  
##  3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :3.0000   Max.   :2.00000  
##  NA's   :2        NA's   :2
# Basement columns of the rows where BsmtFinSF1 is missing
dat %>%
  select(contains("Bsmt")) %>%
  filter(is.na(BsmtFinSF1))
##      BsmtQual    BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 1 no basement no basement  no basement  no basement         NA
##   BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath
## 1  no basement         NA        NA          NA           NA           NA
# Basement columns of the rows where BsmtFullBath is missing
dat %>%
  select(contains("Bsmt")) %>%
  filter(is.na(BsmtFullBath))
##      BsmtQual    BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 1 no basement no basement  no basement  no basement         NA
## 2 no basement no basement  no basement  no basement          0
##   BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF BsmtFullBath BsmtHalfBath
## 1  no basement         NA        NA          NA           NA           NA
## 2  no basement          0         0           0           NA           NA
# All rows with missing numeric basement values are "no basement" rows, so the
# finished/unfinished areas and the bath counts are set to 0.
# TotalBsmtSF is not part of this list: its single NA is still there in the
# summary below and in the later NA table.
for (bsmt_col in c("BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
                   "BsmtFullBath", "BsmtHalfBath")) {
  dat[[bsmt_col]][is.na(dat[[bsmt_col]])] <- 0
}


# Re-check every basement column
dat %>%
  select(contains("Bsmt")) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##         BsmtQual           BsmtCond         BsmtExposure 
##  Ex         : 258   Fa         : 104   Av         : 418  
##  Fa         :  89   Gd         : 123   Gd         : 276  
##  Gd         :1209   no basement:  79   Mn         : 239  
##  no basement:  79   Po         :   5   No         :1907  
##  TA         :1284   TA         :2608   no basement:  79  
##                                                          
##                                                          
##       BsmtFinType1   BsmtFinSF1          BsmtFinType2    BsmtFinSF2     
##  ALQ        :429   Min.   :   0.0   ALQ        :  53   Min.   :   0.00  
##  BLQ        :269   1st Qu.:   0.0   BLQ        :  68   1st Qu.:   0.00  
##  GLQ        :849   Median : 368.0   GLQ        :  34   Median :   0.00  
##  LwQ        :154   Mean   : 441.3   LwQ        :  87   Mean   :  49.57  
##  no basement: 79   3rd Qu.: 733.0   no basement:  79   3rd Qu.:   0.00  
##  Rec        :288   Max.   :5644.0   Rec        : 105   Max.   :1526.00  
##  Unf        :851                    Unf        :2493                    
##    BsmtUnfSF       TotalBsmtSF      BsmtFullBath     BsmtHalfBath    
##  Min.   :   0.0   Min.   :   0.0   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 220.0   1st Qu.: 793.0   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median : 467.0   Median : 989.5   Median :0.0000   Median :0.00000  
##  Mean   : 560.6   Mean   :1051.8   Mean   :0.4296   Mean   :0.06132  
##  3rd Qu.: 805.0   3rd Qu.:1302.0   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :2336.0   Max.   :6110.0   Max.   :3.0000   Max.   :2.00000  
##                   NA's   :1
# From here on every character column of dat is stored as a factor
dat <- mutate_if(dat, is.character, as.factor)

MasVnrType NA values

# Level counts of MasVnrType, including NA
dat$MasVnrType %>%
  as.factor() %>%
  summary()
##  BrkCmn BrkFace    None   Stone    NA's 
##      25     879    1742     249      24
# Summary of both MasVnr* columns
dat %>%
  select(contains("MasVnr")) %>%
  summary()
##    MasVnrType     MasVnrArea    
##  BrkCmn :  25   Min.   :   0.0  
##  BrkFace: 879   1st Qu.:   0.0  
##  None   :1742   Median :   0.0  
##  Stone  : 249   Mean   : 102.2  
##  NA's   :  24   3rd Qu.: 164.0  
##                 Max.   :1600.0  
##                 NA's   :23
# Row count plus mean / median MasVnrArea for each MasVnrType (NA is its own group)
dat %>%
  select(contains("MasVnr")) %>%
  mutate_if(is.character, as.factor) %>%
  group_by(MasVnrType) %>%
  summarise(
    count = n(),
    mean = mean(MasVnrArea, na.rm = TRUE),
    median = median(MasVnrArea, na.rm = TRUE)
  )
## Warning: Factor `MasVnrType` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 5 x 4
##   MasVnrType count    mean median
##   <fct>      <int>   <dbl>  <dbl>
## 1 BrkCmn        25 195.       161
## 2 BrkFace      879 262.       203
## 3 None        1742   0.707      0
## 4 Stone        249 240.       200
## 5 <NA>          24 198        198
# MasVnr* columns of the rows where MasVnrType is missing
dat %>%
  select(starts_with("MasVnr")) %>%
  mutate_if(is.character, as.factor) %>%
  filter(is.na(MasVnrType))
##    MasVnrType MasVnrArea
## 1        <NA>         NA
## 2        <NA>         NA
## 3        <NA>         NA
## 4        <NA>         NA
## 5        <NA>         NA
## 6        <NA>         NA
## 7        <NA>         NA
## 8        <NA>         NA
## 9        <NA>         NA
## 10       <NA>         NA
## 11       <NA>         NA
## 12       <NA>         NA
## 13       <NA>         NA
## 14       <NA>         NA
## 15       <NA>         NA
## 16       <NA>         NA
## 17       <NA>         NA
## 18       <NA>         NA
## 19       <NA>         NA
## 20       <NA>         NA
## 21       <NA>        198
## 22       <NA>         NA
## 23       <NA>         NA
## 24       <NA>         NA
# MasVnrType is missing in 24 rows; exactly one of them has a MasVnrArea (198),
# which is close to the BrkCmn group mean.


# MasVnrArea is filled first: a missing area becomes 0
dat$MasVnrArea <- replace(dat$MasVnrArea, is.na(dat$MasVnrArea), 0)

# Rows with a veneer area but no veneer type
dat %>%
  select(contains("MasVnr")) %>%
  mutate_if(is.character, as.factor) %>%
  filter(is.na(MasVnrType), MasVnrArea != 0)
##   MasVnrType MasVnrArea
## 1       <NA>        198
# Fill the missing MasVnrType values:
#   * MasVnrArea == 0 -> "None"
#   * MasVnrArea != 0 -> "BrkCmn" (the group whose mean area is closest to 198)
# MasVnrType is a factor here; both labels are existing levels. Vectorised masks
# replace the former row-by-row `1:nrow(dat)` loop, which failed in `if (NA)`
# on an empty data frame.
type_missing <- is.na(dat$MasVnrType)
no_veneer <- dat$MasVnrArea == 0
dat$MasVnrType[type_missing & no_veneer] <- "None"
dat$MasVnrType[type_missing & !no_veneer] <- "BrkCmn"

# Re-check both MasVnr* columns
dat %>%
  select(contains("MasVnr")) %>%
  mutate_if(is.character, as.factor) %>%
  summary()
##    MasVnrType     MasVnrArea    
##  BrkCmn :  26   Min.   :   0.0  
##  BrkFace: 879   1st Qu.:   0.0  
##  None   :1765   Median :   0.0  
##  Stone  : 249   Mean   : 101.4  
##                 3rd Qu.: 163.5  
##                 Max.   :1600.0
#No NA Values

MSZoning NA values

# Level counts of MSZoning, including NA
dat$MSZoning %>%
  as.factor() %>%
  summary()
## C (all)      FV      RH      RL      RM    NA's 
##      25     139      26    2265     460       4
# MSZoning identifies the general zoning classification of the sale.

# Full records of the rows where MSZoning is missing
filter(dat, is.na(MSZoning))
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         30     <NA>    109.0000   21780   Grvl no alley      Reg
## 2         20     <NA>     80.0000   14584   Pave no alley      Reg
## 3         70     <NA>    102.4526   56600   Pave no alley      IR1
## 4         20     <NA>    125.0000   31250   Pave no alley      Reg
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl      <NA>    Inside       Gtl       IDOTRR       Norm
## 2         Low    AllPub    Inside       Mod       IDOTRR       Norm
## 3         Low    AllPub    Inside       Gtl       IDOTRR       Norm
## 4         Lvl    AllPub    Inside       Gtl      Mitchel     Artery
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1Story           2           4      1910
## 2       Norm     1Fam     1Story           1           5      1952
## 3       Norm     1Fam     2.5Unf           5           1      1900
## 4       Norm     1Fam     1Story           1           3      1951
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1950     Gable  CompShg     Wd Sdng     Wd Sdng       None
## 2         1952     Gable  CompShg     AsbShng     VinylSd       None
## 3         1950       Hip  CompShg     Wd Sdng     Wd Sdng       None
## 4         1951     Gable  CompShg      CBlock     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation    BsmtQual    BsmtCond
## 1          0        Fa        Fa     CBlock no basement no basement
## 2          0        Fa        Po       Slab no basement no basement
## 3          0        TA        TA     BrkTil          TA          TA
## 4          0        TA        Fa     CBlock no basement no basement
##   BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## 1  no basement  no basement          0  no basement          0         0
## 2  no basement  no basement          0  no basement          0         0
## 3           No          Unf          0          Unf          0       686
## 4  no basement  no basement          0  no basement          0         0
##   TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 1           0    GasA        TA          N      FuseA       810         0
## 2           0    Wall        Po          N      FuseA       733         0
## 3         686    GasA        Ex          Y      SBrkr      1150       686
## 4           0    GasA        TA          Y      FuseA      1600         0
##   LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1            0       810            0            0        1        0
## 2            0       733            0            0        1        0
## 3            0      1836            0            0        2        0
## 4            0      1600            0            0        1        1
##   BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
## 1            1            1          TA            4       Min1          0
## 2            2            1          Fa            4       <NA>          0
## 3            4            1          TA            7       Maj1          0
## 4            3            1          TA            6        Mod          0
##    FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## 1 no fireplace     Detchd        1975          Unf          1        280
## 2 no fireplace     Attchd        1952          Unf          2        487
## 3 no fireplace     Detchd        1900          Unf          1        288
## 4 no fireplace     Attchd        1951          Unf          1        270
##   GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## 1         TA         TA          N        119          24             0
## 2         Fa         Po          N          0           0             0
## 3         TA         Fa          N          0           0             0
## 4         Fa         TA          N          0           0           135
##   X3SsnPorch ScreenPorch PoolArea  PoolQC    Fence MiscFeature MiscVal
## 1          0           0        0 no pool no fence        None       0
## 2          0           0        0 no pool no fence        None       0
## 3          0           0        0 no pool no fence        None       0
## 4          0           0        0 no pool no fence        None       0
##   MoSold YrSold SaleType SaleCondition SalePrice
## 1      3   2009    ConLD        Normal        NA
## 2      2   2008       WD       Abnorml        NA
## 3      1   2008       WD        Normal        NA
## 4      5   2006       WD        Normal        NA
# Row count and SalePrice mean / median / sd for each MSZoning (NA is its own group)
group_by(dat, MSZoning) %>%
  summarise(
    count = n(),
    mean = mean(SalePrice, na.rm = TRUE),
    median = median(SalePrice, na.rm = TRUE),
    sd = sd(SalePrice, na.rm = TRUE)
  )
## Warning: Factor `MSZoning` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 6 x 5
##   MSZoning count    mean median     sd
##   <fct>    <int>   <dbl>  <dbl>  <dbl>
## 1 C (all)     25  74528   74700 33791.
## 2 FV         139 214014. 205950 52370.
## 3 RH          26 131558. 136500 35714.
## 4 RL        2265 191005. 174000 80766.
## 5 RM         460 126317. 120500 48522.
## 6 <NA>         4    NaN      NA   NaN
# The 4 rows with missing MSZoning have no SalePrice either (test set).

# MSZoning within the three MSSubClass values of those rows (20, 30, 70)
filter(dat, MSSubClass %in% c(30, 20, 70)) %>%
  group_by(MSSubClass, MSZoning) %>%
  summarise(
    count = n(),
    mean = mean(SalePrice, na.rm = TRUE),
    median = median(SalePrice, na.rm = TRUE),
    sd = sd(SalePrice, na.rm = TRUE)
  )
## Warning: Factor `MSZoning` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 16 x 6
## # Groups:   MSSubClass [3]
##    MSSubClass MSZoning count    mean  median     sd
##    <fct>      <fct>    <int>   <dbl>   <dbl>  <dbl>
##  1 20         C (all)      3  45652   45652  14624.
##  2 20         FV          34 226290. 222000  52546.
##  3 20         RH           4 102967. 107000  19269.
##  4 20         RL        1016 186467. 159698. 78949.
##  5 20         RM          20 121328. 120000  19839.
##  6 20         <NA>         2    NaN      NA    NaN 
##  7 30         C (all)      8  57950   57950  32598.
##  8 30         RH           2  79000   79000     NA 
##  9 30         RL          61  96481.  91000  26256.
## 10 30         RM          67  97984. 102000  22060.
## 11 30         <NA>         1    NaN      NA     NA 
## 12 70         C (all)      4  40000   40000     NA 
## 13 70         RH           3 124533. 130000  18806.
## 14 70         RL          57 199809. 188850  56026.
## 15 70         RM          63 138403. 134750  36368.
## 16 70         <NA>         1    NaN      NA     NA
#NA values will be replaced by the most frequent MSZoning level (the mode) within each MSSubClass
#NA values in MSSubClass 20 will be replaced as RL
#NA values in MSSubClass 30 will be replaced as RM
#NA values in MSSubClass 70 will be replaced as RM


# Fill the missing MSZoning values with the most frequent zoning of the
# house's MSSubClass (see the table above): RL for class 20, RM for
# classes 30 and 70.
zoning.missing <- is.na(dat$MSZoning)
dat$MSZoning[zoning.missing & dat$MSSubClass == 20] <- "RL"
dat$MSZoning[zoning.missing & dat$MSSubClass %in% c(30, 70)] <- "RM"



summary(as.factor(dat$MSZoning))
## C (all)      FV      RH      RL      RM 
##      25     139      26    2267     462

Other NA values

# Count the missing values in every column (one row, one column per
# variable). list(~ ...) replaces the deprecated funs() helper.
NAtable1 <- data.frame(dat %>% summarise_all(list(~ sum(is.na(.)))))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
##   # Before:
##   funs(name = f(.))
## 
##   # After: 
##   list(name = ~ f(.))
## This warning is displayed once per session.
# Keep only the columns that still contain missing values. drop = FALSE
# keeps the result a data frame even when a single column qualifies.
NAtable1 <- NAtable1[, which(NAtable1 > 0), drop = FALSE]
# SalePrice is removed from the list by name. Unlike
# subset(select = -c(SalePrice)), setdiff() does not fail when the
# column is not part of the table.
NAtable1 <- NAtable1[, setdiff(colnames(NAtable1), "SalePrice"), drop = FALSE]
dim(NAtable1)
## [1] 1 8
# Sort the columns by their NA count, largest first. order() is applied to
# the plain vector of counts because ordering a data frame directly is not
# supported by newer R versions.
NAtable1 <- NAtable1[order(unlist(NAtable1), decreasing = TRUE)]

NAtable1
##   Utilities Functional Exterior1st Exterior2nd TotalBsmtSF Electrical
## 1         2          2           1           1           1          1
##   KitchenQual SaleType
## 1           1        1
#Utilities
summary(as.factor(dat$Utilities))
## AllPub NoSeWa   NA's 
##   2916      1      2
dat$Utilities[is.na(dat$Utilities)] <- "AllPub"

#Functional
summary(as.factor(dat$Functional))
## Maj1 Maj2 Min1 Min2  Mod  Sev  Typ NA's 
##   19    9   65   70   35    2 2717    2
dat$Functional[is.na(dat$Functional)] <- "Typ"

#Exterior1st and Exterior2nd
summary(as.factor(dat$Exterior1st))
## AsbShng AsphShn BrkComm BrkFace  CBlock CemntBd HdBoard ImStucc MetalSd 
##      44       2       6      87       2     126     442       1     450 
## Plywood   Stone  Stucco VinylSd Wd Sdng WdShing    NA's 
##     221       2      43    1025     411      56       1
summary(as.factor(dat$Exterior2nd))
## AsbShng AsphShn Brk Cmn BrkFace  CBlock CmentBd HdBoard ImStucc MetalSd 
##      38       4      22      47       3     126     406      15     447 
##   Other Plywood   Stone  Stucco VinylSd Wd Sdng Wd Shng    NA's 
##       1     270       6      47    1014     391      81       1
dat %>% filter(is.na(Exterior1st))
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         30       RL          85   19550   Pave no alley      Reg
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      Edwards       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1Story           5           7      1940
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2007      Flat  Tar&Grv        <NA>        <NA>       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1          0        TA        TA      PConc       TA       TA           Gd
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          ALQ       1035          Unf          0       545        1580
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr      1518         0            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1518            1            0        1        0            2
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Fa            5        Typ          2          Gd
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1  no garage           0    no garage          0          0  no garage
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1  no garage          Y          0          39             0          0
##   ScreenPorch PoolArea  PoolQC    Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0 no pool no fence        None       0      1   2008
##   SaleType SaleCondition SalePrice
## 1       WD        Normal        NA
dat$Exterior1st[is.na(dat$Exterior1st)] <- "VinylSd"
dat$Exterior2nd[is.na(dat$Exterior2nd)] <- "VinylSd"

#TotalBsmtSF
summary(dat$TotalBsmtSF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   793.0   989.5  1051.8  1302.0  6110.0       1
dat %>% filter(is.na(TotalBsmtSF))
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         20       RM          99    5940   Pave no alley      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub       FR3       Gtl      BrkSide      Feedr
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1Story           4           7      1946
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1950     Gable  CompShg     MetalSd      CBlock       None
##   MasVnrArea ExterQual ExterCond Foundation    BsmtQual    BsmtCond
## 1          0        TA        TA      PConc no basement no basement
##   BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## 1  no basement  no basement          0  no basement          0         0
##   TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 1          NA    GasA        TA          Y      FuseA       896         0
##   LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1            0       896            0            0        1        0
##   BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
## 1            2            1          TA            4        Typ          0
##    FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## 1 no fireplace     Detchd        1946          Unf          1        280
##   GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## 1         TA         TA          Y          0           0             0
##   X3SsnPorch ScreenPorch PoolArea  PoolQC Fence MiscFeature MiscVal MoSold
## 1          0           0        0 no pool MnPrv        None       0      4
##   YrSold SaleType SaleCondition SalePrice
## 1   2008    ConLD       Abnorml        NA
#no basement

dat %>% filter(!is.na(TotalBsmtSF)) %>% 
  group_by(BsmtCond) %>% 
  summarise(count = n(),
            median = median(TotalBsmtSF, na.rm=TRUE))
## # A tibble: 5 x 3
##   BsmtCond    count median
##   <fct>       <int>  <dbl>
## 1 Fa            104   814.
## 2 Gd            123  1094 
## 3 no basement    78     0 
## 4 Po              5   936 
## 5 TA           2608  1008
#no basement -> 0

dat$TotalBsmtSF[is.na(dat$TotalBsmtSF)] <- 0



#Electrical
summary(as.factor(dat$Electrical))
## FuseA FuseF FuseP   Mix SBrkr  NA's 
##   188    50     8     1  2671     1
dat$Electrical[is.na(dat$Electrical)] <- "SBrkr"


#KitchenQual
summary(as.factor(dat$KitchenQual))
##   Ex   Fa   Gd   TA NA's 
##  205   70 1151 1492    1
dat %>% filter(is.na(KitchenQual))
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         50       RL          72   10632   Pave no alley      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      ClearCr       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1.5Fin           5           3      1917
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1950     Gable  CompShg     Wd Sdng     Wd Sdng       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1          0        TA        TA     BrkTil       Gd       Fa           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          Unf          0          Unf          0       689         689
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Gd          N      SBrkr       725       499            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1224            0            0        1        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces  FireplaceQu
## 1            1        <NA>            6        Mod          0 no fireplace
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Detchd        1917          Unf          1        180         Fa
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         Fa          N          0           0           248          0
##   ScreenPorch PoolArea  PoolQC    Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0 no pool no fence        None       0      1   2010
##   SaleType SaleCondition SalePrice
## 1      COD        Normal        NA
dat$KitchenQual[dat$RoofStyle == "Gable" & dat$HouseStyle == "1.5Fin"] %>% table
## .
##  Ex  Fa  Gd  TA 
##   4  17  56 222
dat$KitchenQual[is.na(dat$KitchenQual)] <- "TA"


#SaleType
dat %>% filter(is.na(SaleType))
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         20       RL          85   13770   Pave no alley      Reg
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Corner       Gtl       Sawyer      Feedr
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     1Story           5           6      1958
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1998     Gable  CompShg     Plywood     Plywood    BrkFace
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        340        TA        TA     CBlock       TA       TA           Mn
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          Rec        190          BLQ        873        95        1158
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        TA          Y      SBrkr      1176         0            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1176            1            0        1        0            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          TA            6        Typ          2          Gd
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        1958          Unf          1        303         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0           0             0          0
##   ScreenPorch PoolArea  PoolQC    Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0 no pool no fence        None       0     10   2007
##   SaleType SaleCondition SalePrice
## 1     <NA>        Normal        NA
summary(as.factor(dat$SaleType))
##   COD   Con ConLD ConLI ConLw   CWD   New   Oth    WD  NA's 
##    87     5    26     9     8    12   239     7  2525     1
dat$SaleType[is.na(dat$SaleType)] <- "WD"



# Sanity check: list the rows with a known SalePrice that still have a
# missing value in any column; an empty result means none remain.
dat %>% filter(!is.na(SalePrice)) %>% filter(!complete.cases(.))
##  [1] MSSubClass    MSZoning      LotFrontage   LotArea       Street       
##  [6] Alley         LotShape      LandContour   Utilities     LotConfig    
## [11] LandSlope     Neighborhood  Condition1    Condition2    BldgType     
## [16] HouseStyle    OverallQual   OverallCond   YearBuilt     YearRemodAdd 
## [21] RoofStyle     RoofMatl      Exterior1st   Exterior2nd   MasVnrType   
## [26] MasVnrArea    ExterQual     ExterCond     Foundation    BsmtQual     
## [31] BsmtCond      BsmtExposure  BsmtFinType1  BsmtFinSF1    BsmtFinType2 
## [36] BsmtFinSF2    BsmtUnfSF     TotalBsmtSF   Heating       HeatingQC    
## [41] CentralAir    Electrical    X1stFlrSF     X2ndFlrSF     LowQualFinSF 
## [46] GrLivArea     BsmtFullBath  BsmtHalfBath  FullBath      HalfBath     
## [51] BedroomAbvGr  KitchenAbvGr  KitchenQual   TotRmsAbvGrd  Functional   
## [56] Fireplaces    FireplaceQu   GarageType    GarageYrBlt   GarageFinish 
## [61] GarageCars    GarageArea    GarageQual    GarageCond    PavedDrive   
## [66] WoodDeckSF    OpenPorchSF   EnclosedPorch X3SsnPorch    ScreenPorch  
## [71] PoolArea      PoolQC        Fence         MiscFeature   MiscVal      
## [76] MoSold        YrSold        SaleType      SaleCondition SalePrice    
## <0 rows> (or 0-length row.names)
#We don't have any NA values now

# Normalise column types: integer columns become double, then any
# character columns become factors.
int.cols <- vapply(dat, is.integer, logical(1))
dat[int.cols] <- lapply(dat[int.cols], as.numeric)
chr.cols <- vapply(dat, is.character, logical(1))
dat[chr.cols] <- lapply(dat[chr.cols], as.factor)

Examine Factor Predictors

#factor first
dat %>% select_if(is.factor) %>% summary
##    MSSubClass      MSZoning     Street          Alley      LotShape  
##  20     :1079   C (all):  25   Grvl:  12   Gravel  : 120   IR1: 968  
##  60     : 575   FV     : 139   Pave:2907   no alley:2721   IR2:  76  
##  50     : 287   RH     :  26               Paved   :  78   IR3:  16  
##  120    : 182   RL     :2267                               Reg:1859  
##  30     : 139   RM     : 462                                         
##  70     : 128                                                        
##  (Other): 529                                                        
##  LandContour  Utilities      LotConfig    LandSlope   Neighborhood 
##  Bnk: 117    AllPub:2918   Corner : 511   Gtl:2778   NAmes  : 443  
##  HLS: 120    NoSeWa:   1   CulDSac: 176   Mod: 125   CollgCr: 267  
##  Low:  60                  FR2    :  85   Sev:  16   OldTown: 239  
##  Lvl:2622                  FR3    :  14              Edwards: 194  
##                            Inside :2133              Somerst: 182  
##                                                      NridgHt: 166  
##                                                      (Other):1428  
##    Condition1     Condition2     BldgType      HouseStyle     RoofStyle   
##  Norm   :2511   Norm   :2889   1Fam  :2425   1Story :1471   Flat   :  20  
##  Feedr  : 164   Feedr  :  13   2fmCon:  62   2Story : 872   Gable  :2310  
##  Artery :  92   Artery :   5   Duplex: 109   1.5Fin : 314   Gambrel:  22  
##  RRAn   :  50   PosA   :   4   Twnhs :  96   SLvl   : 128   Hip    : 551  
##  PosN   :  39   PosN   :   4   TwnhsE: 227   SFoyer :  83   Mansard:  11  
##  RRAe   :  28   RRNn   :   2                 2.5Unf :  24   Shed   :   5  
##  (Other):  35   (Other):   2                 (Other):  27                 
##     RoofMatl     Exterior1st    Exterior2nd     MasVnrType   ExterQual
##  CompShg:2876   VinylSd:1026   VinylSd:1015   BrkCmn :  26   Ex: 107  
##  Tar&Grv:  23   MetalSd: 450   MetalSd: 447   BrkFace: 879   Fa:  35  
##  WdShake:   9   HdBoard: 442   HdBoard: 406   None   :1765   Gd: 979  
##  WdShngl:   7   Wd Sdng: 411   Wd Sdng: 391   Stone  : 249   TA:1798  
##  ClyTile:   1   Plywood: 221   Plywood: 270                           
##  Membran:   1   CemntBd: 126   CmentBd: 126                           
##  (Other):   2   (Other): 243   (Other): 264                           
##  ExterCond  Foundation          BsmtQual           BsmtCond   
##  Ex:  12   BrkTil: 311   Ex         : 258   Fa         : 104  
##  Fa:  67   CBlock:1235   Fa         :  89   Gd         : 123  
##  Gd: 299   PConc :1308   Gd         :1209   no basement:  79  
##  Po:   3   Slab  :  49   no basement:  79   Po         :   5  
##  TA:2538   Stone :  11   TA         :1284   TA         :2608  
##            Wood  :   5                                        
##                                                               
##       BsmtExposure       BsmtFinType1      BsmtFinType2   Heating    
##  Av         : 418   ALQ        :429   ALQ        :  53   Floor:   1  
##  Gd         : 276   BLQ        :269   BLQ        :  68   GasA :2874  
##  Mn         : 239   GLQ        :849   GLQ        :  34   GasW :  27  
##  No         :1907   LwQ        :154   LwQ        :  87   Grav :   9  
##  no basement:  79   no basement: 79   no basement:  79   OthW :   2  
##                     Rec        :288   Rec        : 105   Wall :   6  
##                     Unf        :851   Unf        :2493               
##  HeatingQC CentralAir Electrical   KitchenQual Functional 
##  Ex:1493   N: 196     FuseA: 188   Ex: 205     Maj1:  19  
##  Fa:  92   Y:2723     FuseF:  50   Fa:  70     Maj2:   9  
##  Gd: 474              FuseP:   8   Gd:1151     Min1:  65  
##  Po:   3              Mix  :   1   TA:1493     Min2:  70  
##  TA: 857              SBrkr:2672               Mod :  35  
##                                                Sev :   2  
##                                                Typ :2719  
##        FireplaceQu       GarageType      GarageFinish      GarageQual  
##  Ex          :  43   2Types   :  23   Fin      : 719   Ex       :   3  
##  Fa          :  74   Attchd   :1723   no garage: 157   Fa       : 124  
##  Gd          : 744   Basment  :  36   RFn      : 811   Gd       :  24  
##  no fireplace:1420   BuiltIn  : 186   Unf      :1232   no garage: 157  
##  Po          :  46   CarPort  :  15                    Po       :   5  
##  TA          : 592   Detchd   : 779                    TA       :2606  
##                      no garage: 157                                    
##      GarageCond   PavedDrive     PoolQC          Fence      MiscFeature
##  Ex       :   3   N: 216     Ex     :   4   GdPrv   : 118   Gar2:   5  
##  Fa       :  74   P:  62     Fa     :   2   GdWo    : 112   None:2814  
##  Gd       :  15   Y:2641     Gd     :   4   MnPrv   : 329   Othr:   4  
##  no garage: 157              no pool:2909   MnWw    :  12   Shed:  95  
##  Po       :  14                             no fence:2348   TenC:   1  
##  TA       :2656                                                        
##                                                                        
##     SaleType    SaleCondition 
##  WD     :2526   Abnorml: 190  
##  New    : 239   AdjLand:  12  
##  COD    :  87   Alloca :  24  
##  ConLD  :  26   Family :  46  
##  CWD    :  12   Normal :2402  
##  ConLI  :   9   Partial: 245  
##  (Other):  20
#Factor Variables mostly skewed as the table shows

fac.vars <- dat %>% select_if(is.factor) %>% colnames


# Flag factor predictors dominated by a single level: for every factor
# column take the share of its most frequent level and keep the names of
# those where that share exceeds 95%.
# vapply() avoids growing a vector inside a loop, dat[[v]] always yields
# the column itself, and which() skips undefined shares (e.g. an all-NA
# column) instead of failing.
dominant.share <- vapply(
  fac.vars,
  function(v) max(prop.table(table(dat[[v]]))),
  numeric(1)
)
not.imp.fac <- names(dominant.share)[which(dominant.share > 0.95)]

#Each of these variables has a single factor level that covers more than 95% of the observations
not.imp.fac
## [1] "Street"      "Utilities"   "LandSlope"   "Condition2"  "RoofMatl"   
## [6] "Heating"     "PoolQC"      "MiscFeature"
prop.func("Street") 
##             [,1]
## [1,] 0.004110997
## [2,] 0.995889003
table(dat$Street) #Pave has greater than 99% proportion in the Street
## 
## Grvl Pave 
##   12 2907
prop.func("Alley") #Alley skewed
##            [,1]
## [1,] 0.04110997
## [2,] 0.93216855
## [3,] 0.02672148
prop.func("Utilities")
##              [,1]
## [1,] 0.9996574169
## [2,] 0.0003425831

Examine Variables

#Two skewed factor variables 
chisq.test(dat$Street, dat$Alley, simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  dat$Street and dat$Alley
## X-squared = 0.87681, df = NA, p-value = 0.7801
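#Chisq Test : p-value = 0.78 > 0.05, so we fail to reject the null of independence (no evidence that Street and Alley are associated)
#This is not surprising: both variables have almost all observations in one factor level, so the test has little power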


summary(aov(lm(SalePrice ~ Street, data = dat %>% filter(!is.na(SalePrice)))))
##               Df    Sum Sq   Mean Sq F value Pr(>F)
## Street         1 1.551e+10 1.551e+10   2.459  0.117
## Residuals   1458 9.192e+12 6.305e+09
#Anova test shows p value more than 0.05, 
#which means that we cannot reject the hypothesis that the group means are equal

t.test(SalePrice~Street, data = dat %>% filter(!is.na(SalePrice)))
## 
##  Welch Two Sample t-test
## 
## data:  SalePrice by Street
## t = -1.9008, df = 5.061, p-value = 0.115
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -119581.21   17701.13
## sample estimates:
## mean in group Grvl mean in group Pave 
##           130190.5           181130.5
#P value higher than 0.05
#The 95% confidence interval for the difference between the group means contains 0,
#so we cannot conclude that the mean SalePrice differs between Grvl and Pave

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=Street, y=SalePrice)) + geom_boxplot()

#But it won't tell you whether we have to choose these factor variables as predictors or not.
#However, we still can see the relationship between factor predictors 

#creating variables names
# The year/month columns are left out of the pairwise tests. The list is
# kept in one place so that `variables` and the data passed to
# relationship.test() always describe exactly the same columns.
rel.excluded <- c("YearBuilt", "YearRemodAdd", "GarageYrBlt", "YrSold", "MoSold")
variables <- setdiff(colnames(dat), rel.excluded)

#dummy data
test.data <- data.frame(cols = variables)

# Only rows with a known SalePrice are passed to the test.
data.pval <- relationship.test(variables, test.data, 
                               dat %>% 
                                 filter(!is.na(SalePrice)) %>% 
                                 select(one_of(variables)))


# Show the results against SalePrice and against the near-constant factors.
# one_of() makes it explicit that not.imp.fac is an external character
# vector of column names, not a column of data.pval.
data.pval %>% select(cols, SalePrice, one_of(not.imp.fac)) %>% head
##          cols SalePrice Street Utilities LandSlope Condition2 RoofMatl
## 1  MSSubClass     0.000  0.105     1.000     0.041      0.008    0.091
## 2    MSZoning     0.000  0.001     1.000     0.023      0.154    0.884
## 3 LotFrontage     0.347  0.030     0.484     0.000      0.092    0.000
## 4     LotArea     0.264  0.000     0.699     0.000      0.430    0.000
## 5      Street     0.117  0.000     1.000     0.001      1.000    1.000
## 6       Alley     0.000  1.000     1.000     0.826      1.000    1.000
##   Heating PoolQC MiscFeature
## 1   0.006  0.094       0.005
## 2   0.115  1.000       0.287
## 3   0.849  0.000       0.745
## 4   0.000  0.002       0.001
## 5   1.000  1.000       0.019
## 6   0.006  1.000       0.333
#factor vs factor : if <0.05 (p value), dependent (independence is rejected), if not, no evidence of dependence
#factor vs numeric : if <0.05, at least one factor level has a different mean than the others (dependent). 
#if not, no evidence that the level means differ (non linear, treated as independent)
#numeric vs numeric : if <0.5, low correlation, if not, high correlation

#we notice that those variables are highly dependent to other variables 



#Let's examine all predictors deeply

BldgType

dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=BldgType, y=SalePrice, fill=BldgType))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(BldgType) %>% summarise(count = n(),
                                   mean = mean(SalePrice),
                                   median = median(SalePrice))
## # A tibble: 5 x 4
##   BldgType count    mean median
##   <fct>    <int>   <dbl>  <dbl>
## 1 1Fam      1220 185764. 167900
## 2 2fmCon      31 128432. 127500
## 3 Duplex      52 133541. 135980
## 4 Twnhs       43 135912. 137500
## 5 TwnhsE     114 181959. 172200
table(dat$BldgType)
## 
##   1Fam 2fmCon Duplex  Twnhs TwnhsE 
##   2425     62    109     96    227
train.test.graph("BldgType", dat)

#creating new BldgType variable by the group mean of SalePrice
# Collapse BldgType into two groups; any level outside both groups stays NA.
dat$BldgType.new <- ifelse(
  dat$BldgType %in% c("2fmCon", "Duplex", "Twnhs"),
  "low bldg",
  ifelse(dat$BldgType %in% c("1Fam", "TwnhsE"), "high bldg", NA_character_)
)

table(dat$BldgType.new)
## 
## high bldg  low bldg 
##      2652       267
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=BldgType.new, y=SalePrice, fill=BldgType.new))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(BldgType.new) %>% summarise(count = n(),
                                   mean = mean(SalePrice),
                                   median = median(SalePrice))
## # A tibble: 2 x 4
##   BldgType.new count    mean median
##   <chr>        <int>   <dbl>  <dbl>
## 1 high bldg     1334 185439. 168250
## 2 low bldg       126 133093. 134450
train.test.graph("BldgType.new", dat)

#HouseStyle

dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=HouseStyle, y=SalePrice, fill=HouseStyle))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(HouseStyle) %>% summarise(count = n(),
                                   mean = mean(SalePrice),
                                   median = median(SalePrice)) %>% arrange(desc(mean))
## # A tibble: 8 x 4
##   HouseStyle count    mean median
##   <fct>      <int>   <dbl>  <dbl>
## 1 2.5Fin         8 220000  194000
## 2 2Story       445 210052. 190000
## 3 1Story       726 175985. 154750
## 4 SLvl          65 166703. 164500
## 5 2.5Unf        11 157355. 133900
## 6 1.5Fin       154 143117. 132000
## 7 SFoyer        37 135074. 135960
## 8 1.5Unf        14 110150  111250
dat %>% filter(!is.na(SalePrice)) %>%
  group_by(HouseStyle) %>% tally()
## # A tibble: 8 x 2
##   HouseStyle     n
##   <fct>      <int>
## 1 1.5Fin       154
## 2 1.5Unf        14
## 3 1Story       726
## 4 2.5Fin         8
## 5 2.5Unf        11
## 6 2Story       445
## 7 SFoyer        37
## 8 SLvl          65
dat %>% filter(is.na(SalePrice)) %>%
  group_by(HouseStyle) %>% tally()
## # A tibble: 7 x 2
##   HouseStyle     n
##   <fct>      <int>
## 1 1.5Fin       160
## 2 1.5Unf         5
## 3 1Story       745
## 4 2.5Unf        13
## 5 2Story       427
## 6 SFoyer        46
## 7 SLvl          63
train.test.graph("HouseStyle", dat)

#Test set doesn't have 2.5Fin 
#combine 2.5Fin -> 2Story, which has a similar group mean

#and creating new HouseStyle variable by group mean of saleprice

# Store HouseStyle as character and relabel "2.5Fin" as "2Story".
dat$HouseStyle <- replace(as.character(dat$HouseStyle),
                          dat$HouseStyle == "2.5Fin",
                          "2Story")

table(dat$HouseStyle)
## 
## 1.5Fin 1.5Unf 1Story 2.5Unf 2Story SFoyer   SLvl 
##    314     19   1471     24    880     83    128
# Map every HouseStyle level to a high / medium / low group through a
# named lookup vector; levels missing from the lookup become NA.
house.group <- c(
  "2Story" = "high house",
  "1Story" = "medium house",
  "SLvl"   = "medium house",
  "2.5Unf" = "low house",
  "1.5Fin" = "low house",
  "SFoyer" = "low house",
  "1.5Unf" = "low house"
)
dat$HouseStyle.new <- unname(house.group[as.character(dat$HouseStyle)])

table(dat$HouseStyle.new)
## 
##   high house    low house medium house 
##          880          440         1599
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=HouseStyle.new, y=SalePrice, fill=HouseStyle.new))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(HouseStyle.new) %>% summarise(count = n(),
                                   mean = mean(SalePrice),
                                   median = median(SalePrice)) %>% arrange(desc(mean))
## # A tibble: 3 x 4
##   HouseStyle.new count    mean median
##   <chr>          <int>   <dbl>  <dbl>
## 1 high house       453 210227. 190000
## 2 medium house     791 175223. 155000
## 3 low house        216 140327. 133000
train.test.graph("HouseStyle.new", dat)

#removing original BldgType and HouseStyle
dat <- dat %>% subset(select = -c(BldgType,HouseStyle))

Roof

dat %>% group_by(RoofStyle, RoofMatl) %>% tally()
## # A tibble: 18 x 3
## # Groups:   RoofStyle [6]
##    RoofStyle RoofMatl     n
##    <fct>     <fct>    <int>
##  1 Flat      CompShg      1
##  2 Flat      Membran      1
##  3 Flat      Metal        1
##  4 Flat      Tar&Grv     17
##  5 Gable     CompShg   2298
##  6 Gable     Roll         1
##  7 Gable     Tar&Grv      6
##  8 Gable     WdShngl      5
##  9 Gambrel   CompShg     22
## 10 Hip       ClyTile      1
## 11 Hip       CompShg    544
## 12 Hip       WdShake      5
## 13 Hip       WdShngl      1
## 14 Mansard   CompShg      8
## 15 Mansard   WdShake      3
## 16 Shed      CompShg      3
## 17 Shed      WdShake      1
## 18 Shed      WdShngl      1
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=RoofStyle, y=SalePrice, fill=RoofStyle)) +
  geom_boxplot()+
  theme(axis.text.x = element_text(angle=90))

dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=RoofStyle, y=SalePrice, fill=RoofStyle)) +
  geom_bar(stat="summary", fun.y="mean")

dat %>% filter(!is.na(SalePrice)) %>% 
  group_by(RoofStyle) %>% 
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd= sd(SalePrice))
## # A tibble: 6 x 5
##   RoofStyle count    mean median      sd
##   <fct>     <int>   <dbl>  <dbl>   <dbl>
## 1 Flat         13 194690  185000  62523.
## 2 Gable      1141 171484. 160000  66331.
## 3 Gambrel      11 148909. 139000  67014.
## 4 Hip         286 218877. 176500 111550.
## 5 Mansard       7 180568. 175000  58058.
## 6 Shed          2 225000  225000  49497.
dat %>% filter(is.na(SalePrice)) %>% group_by(RoofStyle) %>% tally()
## # A tibble: 6 x 2
##   RoofStyle     n
##   <fct>     <int>
## 1 Flat          7
## 2 Gable      1169
## 3 Gambrel      11
## 4 Hip         265
## 5 Mansard       4
## 6 Shed          3
dat %>% filter(!is.na(SalePrice)) %>% group_by(RoofStyle) %>% tally()
## # A tibble: 6 x 2
##   RoofStyle     n
##   <fct>     <int>
## 1 Flat         13
## 2 Gable      1141
## 3 Gambrel      11
## 4 Hip         286
## 5 Mansard       7
## 6 Shed          2
train.test.graph("RoofStyle", dat)

#RoofStyle may not be a good predictor, let's keep this anyway


#RoofMatl
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=RoofMatl, y=SalePrice, fill=RoofMatl)) +
  geom_boxplot()+
  theme(axis.text.x = element_text(angle=90))

dat %>% filter(!is.na(SalePrice)) %>% 
  group_by(RoofMatl) %>% 
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            min = min(SalePrice),
            max = max(SalePrice))
## # A tibble: 8 x 6
##   RoofMatl count    mean median    min    max
##   <fct>    <int>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 ClyTile      1 160000  160000 160000 160000
## 2 CompShg   1434 179804. 162000  34900 745000
## 3 Membran      1 241500  241500 241500 241500
## 4 Metal        1 180000  180000 180000 180000
## 5 Roll         1 137000  137000 137000 137000
## 6 Tar&Grv     11 185406. 167000  82000 274970
## 7 WdShake      5 241400  242000 190000 287000
## 8 WdShngl      6 390250  332500 168500 755000
dat %>% filter(is.na(SalePrice)) %>% group_by(RoofMatl) %>% tally()
## # A tibble: 4 x 2
##   RoofMatl     n
##   <fct>    <int>
## 1 CompShg   1442
## 2 Tar&Grv     12
## 3 WdShake      4
## 4 WdShngl      1
dat %>% filter(!is.na(SalePrice)) %>% group_by(RoofMatl) %>% tally()
## # A tibble: 8 x 2
##   RoofMatl     n
##   <fct>    <int>
## 1 ClyTile      1
## 2 CompShg   1434
## 3 Membran      1
## 4 Metal        1
## 5 Roll         1
## 6 Tar&Grv     11
## 7 WdShake      5
## 8 WdShngl      6
train.test.graph("RoofMatl", dat)

#notice test set doesn't have ClyTile / Membran / Metal / Roll

#ClyTile -> CompShg
#Membran -> WdShake
#Metal -> CompShg
#Roll -> CompShg

# Store RoofMatl as character, then relabel Membran as WdShake and
# ClyTile / Metal / Roll as CompShg.
dat$RoofMatl <- as.character(dat$RoofMatl)
dat$RoofMatl <- replace(dat$RoofMatl, dat$RoofMatl == "Membran", "WdShake")
dat$RoofMatl <- replace(dat$RoofMatl,
                        dat$RoofMatl %in% c("ClyTile", "Metal", "Roll"),
                        "CompShg")


table(dat$RoofMatl)
## 
## CompShg Tar&Grv WdShake WdShngl 
##    2879      23      10       7
train.test.graph("RoofMatl", dat)

dat %>% select(contains("Roof")) %>% mutate_if(is.character, as.factor) %>% summary
##    RoofStyle       RoofMatl   
##  Flat   :  20   CompShg:2879  
##  Gable  :2310   Tar&Grv:  23  
##  Gambrel:  22   WdShake:  10  
##  Hip    : 551   WdShngl:   7  
##  Mansard:  11                 
##  Shed   :   5

Exterior..

dat %>% select(contains("Exter")) %>% summary
##   Exterior1st    Exterior2nd   ExterQual ExterCond
##  VinylSd:1026   VinylSd:1015   Ex: 107   Ex:  12  
##  MetalSd: 450   MetalSd: 447   Fa:  35   Fa:  67  
##  HdBoard: 442   HdBoard: 406   Gd: 979   Gd: 299  
##  Wd Sdng: 411   Wd Sdng: 391   TA:1798   Po:   3  
##  Plywood: 221   Plywood: 270             TA:2538  
##  CemntBd: 126   CmentBd: 126                      
##  (Other): 243   (Other): 264
#Exterior1st and Exterior2nd 
table(dat$Exterior1st, dat$Exterior2nd)
##          
##           AsbShng AsphShn Brk Cmn BrkFace CBlock CmentBd HdBoard ImStucc
##   AsbShng      35       0       0       0      0       1       0       0
##   AsphShn       0       2       0       0      0       0       0       0
##   BrkComm       0       0       4       0      0       0       0       0
##   BrkFace       1       0       0      44      0       0       3       0
##   CBlock        0       0       0       0      1       0       0       0
##   CemntBd       0       0       0       0      0     124       0       0
##   HdBoard       0       1       0       1      0       0     383       6
##   ImStucc       0       0       0       0      0       0       0       1
##   MetalSd       0       1       0       0      1       0       3       0
##   Plywood       0       0      18       0      1       0       6       4
##   Stone         0       0       0       0      0       0       1       0
##   Stucco        0       0       0       1      0       1       0       0
##   VinylSd       1       0       0       0      0       0       1       1
##   Wd Sdng       1       0       0       1      0       0       7       3
##   WdShing       0       0       0       0      0       0       2       0
##          
##           MetalSd Other Plywood Stone Stucco VinylSd Wd Sdng Wd Shng
##   AsbShng       0     0       5     0      1       1       1       0
##   AsphShn       0     0       0     0      0       0       0       0
##   BrkComm       0     0       0     0      1       0       1       0
##   BrkFace       3     0       8     3      3       1      20       1
##   CBlock        0     0       0     0      0       1       0       0
##   CemntBd       0     0       0     0      0       0       1       1
##   HdBoard       1     0      41     0      1       0       3       5
##   ImStucc       0     0       0     0      0       0       0       0
##   MetalSd     437     0       0     0      3       2       2       1
##   Plywood       0     0     186     0      0       1       4       1
##   Stone         0     0       0     1      0       0       0       0
##   Stucco        0     0       1     1     33       0       2       4
##   VinylSd       2     1       2     0      1    1006       2       9
##   Wd Sdng       4     0      17     1      3       3     353      18
##   WdShing       0     0      10     0      1       0       2      41
table(dat$Exterior1st)
## 
## AsbShng AsphShn BrkComm BrkFace  CBlock CemntBd HdBoard ImStucc MetalSd 
##      44       2       6      87       2     126     442       1     450 
## Plywood   Stone  Stucco VinylSd Wd Sdng WdShing 
##     221       2      43    1026     411      56
table(dat$Exterior2nd)
## 
## AsbShng AsphShn Brk Cmn BrkFace  CBlock CmentBd HdBoard ImStucc MetalSd 
##      38       4      22      47       3     126     406      15     447 
##   Other Plywood   Stone  Stucco VinylSd Wd Sdng Wd Shng 
##       1     270       6      47    1015     391      81
#Exterior 1st visualization 
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=Exterior1st, y=SalePrice, fill=Exterior1st)) +
  geom_boxplot()+
  theme(axis.text.x = element_text(angle=90))

#Exterior 2nd visualization
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=Exterior2nd, y=SalePrice, fill=Exterior2nd)) +
  geom_boxplot()+
  theme(axis.text.x = element_text(angle=90))

#combine exterior1st and exterior2nd
#flag each house by whether its two exterior covering labels are identical
#NOTE(review): the two columns spell some materials differently
#("CemntBd"/"CmentBd", "BrkComm"/"Brk Cmn", "WdShing"/"Wd Shng"), so those
#pairs are labelled "more option Exter" by this exact string comparison
dat$Exterior1st <- as.character(dat$Exterior1st)
dat$Exterior2nd <- as.character(dat$Exterior2nd)
#vectorised comparison instead of a 1:nrow(dat) loop: a single pass over
#both columns, and it does not misbehave when dat has zero rows
dat$Exterior <- ifelse(dat$Exterior1st == dat$Exterior2nd,
                       "same Exter", "more option Exter")

table(dat$Exterior)
## 
## more option Exter        same Exter 
##               437              2482
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=Exterior, y=SalePrice, fill=Exterior))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(Exterior) %>% summarise(count = n(),
                                   mean = mean(SalePrice),
                                   median = median(SalePrice),
                                   sd = sd(SalePrice))
## # A tibble: 2 x 5
##   Exterior          count    mean median     sd
##   <chr>             <int>   <dbl>  <dbl>  <dbl>
## 1 more option Exter   215 182820. 151000 97887.
## 2 same Exter         1245 180593. 165000 75847.
#group mean almost same 





#Exterior 1st
dat %>% filter(!is.na(SalePrice)) %>%
  group_by(Exterior1st) %>% summarise(count = n(),
                                      mean = mean(SalePrice),
                                      median = median(SalePrice),
                                      sd = sd(SalePrice)) %>% arrange(median)
## # A tibble: 15 x 5
##    Exterior1st count    mean  median      sd
##    <chr>       <int>   <dbl>   <dbl>   <dbl>
##  1 BrkComm         2  71000   71000   15556.
##  2 AsphShn         1 100000  100000      NA 
##  3 CBlock          1 105000  105000      NA 
##  4 AsbShng        20 107386. 108000   33756.
##  5 WdShing        26 150655. 128700   72508.
##  6 Wd Sdng       206 149842. 138944.  71130.
##  7 MetalSd       220 149422. 139000   54776.
##  8 Stucco         25 162990  144000   83307.
##  9 HdBoard       222 163077. 149900   66306.
## 10 BrkFace        50 194573  165750   82842.
## 11 Plywood       108 175942. 167450   49497.
## 12 VinylSd       515 213733. 200000   80647.
## 13 CemntBd        61 231691. 236500  120576.
## 14 Stone           2 258500  258500   40305.
## 15 ImStucc         1 262000  262000      NA
dat %>% filter(is.na(SalePrice)) %>%
  group_by(Exterior1st) %>% tally()
## # A tibble: 13 x 2
##    Exterior1st     n
##    <chr>       <int>
##  1 AsbShng        24
##  2 AsphShn         1
##  3 BrkComm         4
##  4 BrkFace        37
##  5 CBlock          1
##  6 CemntBd        65
##  7 HdBoard       220
##  8 MetalSd       230
##  9 Plywood       113
## 10 Stucco         18
## 11 VinylSd       511
## 12 Wd Sdng       205
## 13 WdShing        30
dat$Exterior1st <- as.character(dat$Exterior1st)

train.ext1.level <-  levels(as.factor(dat$Exterior1st[!is.na(dat$SalePrice)]))

test.ext1.level <- levels(as.factor(dat$Exterior1st[is.na(dat$SalePrice)]))

train.ext1.level[!train.ext1.level %in% test.ext1.level]
## [1] "ImStucc" "Stone"
#only training set has ImStucc and Stone

#fold the two train-only levels (ImStucc, Stone) into CemntBd
dat$Exterior1st[dat$Exterior1st %in% c("ImStucc", "Stone")] <- "CemntBd"

ext1 <- dat %>% filter(!is.na(SalePrice)) %>%
  group_by(Exterior1st) %>% summarise(count = n(),
                                      mean = mean(SalePrice),
                                      median = median(SalePrice),
                                      sd = sd(SalePrice)) %>% arrange(median)
ext1
## # A tibble: 13 x 5
##    Exterior1st count    mean  median      sd
##    <chr>       <int>   <dbl>   <dbl>   <dbl>
##  1 BrkComm         2  71000   71000   15556.
##  2 AsphShn         1 100000  100000      NA 
##  3 CBlock          1 105000  105000      NA 
##  4 AsbShng        20 107386. 108000   33756.
##  5 WdShing        26 150655. 128700   72508.
##  6 Wd Sdng       206 149842. 138944.  71130.
##  7 MetalSd       220 149422. 139000   54776.
##  8 Stucco         25 162990  144000   83307.
##  9 HdBoard       222 163077. 149900   66306.
## 10 BrkFace        50 194573  165750   82842.
## 11 Plywood       108 175942. 167450   49497.
## 12 VinylSd       515 213733. 200000   80647.
## 13 CemntBd        64 233002. 238750  117931.
ext1$Exterior1st[ext1$median < 130000]
## [1] "BrkComm" "AsphShn" "CBlock"  "AsbShng" "WdShing"
dat$Exterior1st[dat$Exterior1st %in% ext1$Exterior1st[ext1$median < 130000]] <- "WdShing"
dat$Exterior1st[dat$Exterior1st == "Stucco"] <- "HdBoard"

train.test.graph("Exterior1st", dat)

#Exterior 2nd

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(Exterior2nd) %>% summarise(count = n(),
                                      mean = mean(SalePrice),
                                      median = median(SalePrice),
                                      sd = sd(SalePrice)) %>% arrange(median)
## # A tibble: 16 x 5
##    Exterior2nd count    mean  median      sd
##    <chr>       <int>   <dbl>   <dbl>   <dbl>
##  1 CBlock          1 105000  105000      NA 
##  2 AsbShng        20 114061. 111000   42315.
##  3 Wd Sdng       197 148386. 138000   59893.
##  4 Wd Shng        38 161329. 138225   88974.
##  5 MetalSd       214 149803. 138750   55079.
##  6 AsphShn         3 138000  139000   37510.
##  7 Stucco         26 155905. 142000   74862.
##  8 Brk Cmn         7 126714. 147000   38693.
##  9 HdBoard       207 167662. 155000   70061.
## 10 BrkFace        25 195818  160000   95098.
## 11 Plywood       142 168112. 160750   46956.
## 12 Stone           5 158225. 177000   63533.
## 13 ImStucc        10 252070  187600  193177.
## 14 VinylSd       504 214432. 200070.  80708.
## 15 CmentBd        60 230094. 238750  116140.
## 16 Other           1 319000  319000      NA
dat %>% filter(is.na(SalePrice)) %>%
  group_by(Exterior2nd) %>% tally()
## # A tibble: 15 x 2
##    Exterior2nd     n
##    <chr>       <int>
##  1 AsbShng        18
##  2 AsphShn         1
##  3 Brk Cmn        15
##  4 BrkFace        22
##  5 CBlock          2
##  6 CmentBd        66
##  7 HdBoard       199
##  8 ImStucc         5
##  9 MetalSd       233
## 10 Plywood       128
## 11 Stone           1
## 12 Stucco         21
## 13 VinylSd       511
## 14 Wd Sdng       194
## 15 Wd Shng        43
dat$Exterior2nd <- as.character(dat$Exterior2nd)

train.ext2.level <-  levels(as.factor(dat$Exterior2nd[!is.na(dat$SalePrice)]))

test.ext2.level <- levels(as.factor(dat$Exterior2nd[is.na(dat$SalePrice)]))

train.ext2.level[!train.ext2.level %in% test.ext2.level]
## [1] "Other"
dat$Exterior2nd[dat$Exterior2nd == "Other"] <- "CmentBd"


ext2 <- dat %>% filter(!is.na(SalePrice)) %>%
  group_by(Exterior2nd) %>% summarise(count = n(),
                                      mean = mean(SalePrice),
                                      median = median(SalePrice),
                                      sd = sd(SalePrice)) %>% arrange(median)

ext2
## # A tibble: 15 x 5
##    Exterior2nd count    mean  median      sd
##    <chr>       <int>   <dbl>   <dbl>   <dbl>
##  1 CBlock          1 105000  105000      NA 
##  2 AsbShng        20 114061. 111000   42315.
##  3 Wd Sdng       197 148386. 138000   59893.
##  4 Wd Shng        38 161329. 138225   88974.
##  5 MetalSd       214 149803. 138750   55079.
##  6 AsphShn         3 138000  139000   37510.
##  7 Stucco         26 155905. 142000   74862.
##  8 Brk Cmn         7 126714. 147000   38693.
##  9 HdBoard       207 167662. 155000   70061.
## 10 BrkFace        25 195818  160000   95098.
## 11 Plywood       142 168112. 160750   46956.
## 12 Stone           5 158225. 177000   63533.
## 13 ImStucc        10 252070  187600  193177.
## 14 VinylSd       504 214432. 200070.  80708.
## 15 CmentBd        61 231551. 241000  115730.
dat$Exterior2nd[dat$Exterior2nd %in% ext2$Exterior2nd[ext2$median < 138000]] <- "Wd Sdng"
dat$Exterior2nd[dat$Exterior2nd == "Wd Shng"] <- "MetalSd"
dat$Exterior2nd[dat$Exterior2nd == "AsphShn"] <- "MetalSd"
dat$Exterior2nd[dat$Exterior2nd %in% c("Stucco", "Brk Cmn")] <- "HdBoard"
dat$Exterior2nd[dat$Exterior2nd %in% c("BrkFace", "Stone")] <- "Plywood"
dat$Exterior2nd[dat$Exterior2nd == "ImStucc"] <- "VinylSd"

train.test.graph("Exterior2nd", dat)

#ExterCond
table(dat$ExterCond)
## 
##   Ex   Fa   Gd   Po   TA 
##   12   67  299    3 2538
#ext.qual: lookup from an exterior quality/condition label to a numeric score
#(Ex maps to 5, Po maps to 1), used to treat the rating as a continuous variable
ext.qual <- setNames(c(5, 4, 3, 2, 1), c("Ex", "Gd", "TA", "Fa", "Po"))

#ExterQual and ExterCond
dat %>% group_by(ExterQual, ExterCond) %>% tally()
## # A tibble: 16 x 3
## # Groups:   ExterQual [4]
##    ExterQual ExterCond     n
##    <fct>     <fct>     <int>
##  1 Ex        Ex            3
##  2 Ex        Gd            3
##  3 Ex        TA          101
##  4 Fa        Fa           19
##  5 Fa        Gd            1
##  6 Fa        Po            1
##  7 Fa        TA           14
##  8 Gd        Ex            3
##  9 Gd        Fa            1
## 10 Gd        Gd           75
## 11 Gd        TA          900
## 12 TA        Ex            6
## 13 TA        Fa           47
## 14 TA        Gd          220
## 15 TA        Po            2
## 16 TA        TA         1523
prop.func("ExterQual")
##            [,1]
## [1,] 0.03665639
## [2,] 0.01199041
## [3,] 0.33538883
## [4,] 0.61596437
prop.func("ExterCond")
##             [,1]
## [1,] 0.004110997
## [2,] 0.022953066
## [3,] 0.102432340
## [4,] 0.001027749
## [5,] 0.869475848
table(dat$ExterQual)
## 
##   Ex   Fa   Gd   TA 
##  107   35  979 1798
table(dat$ExterCond)
## 
##   Ex   Fa   Gd   Po   TA 
##   12   67  299    3 2538
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=ExterQual, y=SalePrice, fill=ExterQual))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=ExterCond, y=SalePrice, fill=ExterCond))+
  geom_boxplot()

#Exter Cond
dat %>% filter(!is.na(SalePrice)) %>% group_by(ExterCond) %>% tally()#train set has 1 poor
## # A tibble: 5 x 2
##   ExterCond     n
##   <fct>     <int>
## 1 Ex            3
## 2 Fa           28
## 3 Gd          146
## 4 Po            1
## 5 TA         1282
dat %>% filter(is.na(SalePrice)) %>% group_by(ExterCond) %>% tally()#test set has 2 poor
## # A tibble: 5 x 2
##   ExterCond     n
##   <fct>     <int>
## 1 Ex            9
## 2 Fa           39
## 3 Gd          153
## 4 Po            2
## 5 TA         1256
train.test.graph("ExterCond", dat)

dat$ExterCond <- ordTonum(dat$ExterCond, ext.qual)

table(dat$ExterCond)
## 
##    1    2    3    4    5 
##    3   67 2538  299   12
#ExterQual
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=ExterQual, y=SalePrice, fill=ExterQual))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% group_by(ExterQual) %>% 
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice))
## # A tibble: 4 x 5
##   ExterQual count    mean  median      sd
##   <fct>     <int>   <dbl>   <dbl>   <dbl>
## 1 Ex           52 367361. 364606. 116401.
## 2 Fa           14  87985.  82250   39827.
## 3 Gd          488 231634. 220000   71189.
## 4 TA          906 144341. 139450   42472.
dat %>% filter(is.na(SalePrice)) %>% group_by(ExterQual) %>% tally() 
## # A tibble: 4 x 2
##   ExterQual     n
##   <fct>     <int>
## 1 Ex           55
## 2 Fa           21
## 3 Gd          491
## 4 TA          892
dat %>% filter(!is.na(SalePrice)) %>% group_by(ExterQual) %>% tally() 
## # A tibble: 4 x 2
##   ExterQual     n
##   <fct>     <int>
## 1 Ex           52
## 2 Fa           14
## 3 Gd          488
## 4 TA          906
train.test.graph("ExterQual", dat)

table(dat$ExterQual)
## 
##   Ex   Fa   Gd   TA 
##  107   35  979 1798
dat$ExterQual <- ordTonum(dat$ExterQual, ext.qual)
## The following `from` values were not present in `x`: Po
dat <- dat %>% mutate_if(is.character, as.factor)
dat %>% select(contains("Ext")) %>% summary
##   Exterior1st    Exterior2nd     ExterQual       ExterCond    
##  VinylSd:1026   CmentBd: 127   Min.   :2.000   Min.   :1.000  
##  HdBoard: 485   HdBoard: 475   1st Qu.:3.000   1st Qu.:3.000  
##  MetalSd: 450   MetalSd: 532   Median :3.000   Median :3.000  
##  Wd Sdng: 411   Plywood: 323   Mean   :3.397   Mean   :3.086  
##  Plywood: 221   VinylSd:1030   3rd Qu.:4.000   3rd Qu.:3.000  
##  CemntBd: 129   Wd Sdng: 432   Max.   :5.000   Max.   :5.000  
##  (Other): 197                                                 
##               Exterior   
##  more option Exter: 437  
##  same Exter       :2482  
##                          
##                          
##                          
##                          
## 

Basement

dat %>% select(contains("Bsmt")) %>% summary
##         BsmtQual           BsmtCond         BsmtExposure 
##  Ex         : 258   Fa         : 104   Av         : 418  
##  Fa         :  89   Gd         : 123   Gd         : 276  
##  Gd         :1209   no basement:  79   Mn         : 239  
##  no basement:  79   Po         :   5   No         :1907  
##  TA         :1284   TA         :2608   no basement:  79  
##                                                          
##                                                          
##       BsmtFinType1   BsmtFinSF1          BsmtFinType2    BsmtFinSF2     
##  ALQ        :429   Min.   :   0.0   ALQ        :  53   Min.   :   0.00  
##  BLQ        :269   1st Qu.:   0.0   BLQ        :  68   1st Qu.:   0.00  
##  GLQ        :849   Median : 368.0   GLQ        :  34   Median :   0.00  
##  LwQ        :154   Mean   : 441.3   LwQ        :  87   Mean   :  49.57  
##  no basement: 79   3rd Qu.: 733.0   no basement:  79   3rd Qu.:   0.00  
##  Rec        :288   Max.   :5644.0   Rec        : 105   Max.   :1526.00  
##  Unf        :851                    Unf        :2493                    
##    BsmtUnfSF       TotalBsmtSF    BsmtFullBath     BsmtHalfBath    
##  Min.   :   0.0   Min.   :   0   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 220.0   1st Qu.: 793   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median : 467.0   Median : 989   Median :0.0000   Median :0.00000  
##  Mean   : 560.6   Mean   :1051   Mean   :0.4296   Mean   :0.06132  
##  3rd Qu.: 805.0   3rd Qu.:1302   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :2336.0   Max.   :6110   Max.   :3.0000   Max.   :2.00000  
## 
#bsmt.qual: lookup from a basement quality/condition label to a numeric score
#(Ex maps to 5, Po maps to 1, and "no basement" maps to 0)
bsmt.qual <- setNames(
  c(5, 4, 3, 2, 1, 0),
  c("Ex", "Gd", "TA", "Fa", "Po", "no basement")
)


bsmt.qual
##          Ex          Gd          TA          Fa          Po no basement 
##           5           4           3           2           1           0
#BsmtQual
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=BsmtQual, y=SalePrice, fill=BsmtQual))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(BsmtQual) %>%
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice))
## # A tibble: 5 x 5
##   BsmtQual    count    mean median      sd
##   <fct>       <int>   <dbl>  <dbl>   <dbl>
## 1 Ex            121 327041. 318000 113563.
## 2 Fa             35 115692. 112000  34470.
## 3 Gd            618 202688. 192070  58092.
## 4 no basement    37 105653. 101800  29279.
## 5 TA            649 140760. 135500  43483.
dat %>% filter(is.na(SalePrice)) %>% group_by(BsmtQual) %>% tally()
## # A tibble: 5 x 2
##   BsmtQual        n
##   <fct>       <int>
## 1 Ex            137
## 2 Fa             54
## 3 Gd            591
## 4 no basement    42
## 5 TA            635
train.test.graph("BsmtQual", dat)

dat$BsmtQual <- ordTonum(dat$BsmtQual, bsmt.qual)
## The following `from` values were not present in `x`: Po
#BsmtCond
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=BsmtCond, y= SalePrice, fill=BsmtCond))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(BsmtCond) %>%
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice))
## # A tibble: 5 x 5
##   BsmtCond    count    mean median     sd
##   <fct>       <int>   <dbl>  <dbl>  <dbl>
## 1 Fa             45 121810. 118500 43468.
## 2 Gd             65 213600. 193879 72663.
## 3 no basement    37 105653. 101800 29279.
## 4 Po              2  64000   64000  4243.
## 5 TA           1311 183633. 165000 79515.
dat %>% filter(!is.na(SalePrice)) %>% group_by(BsmtCond) %>% tally() #train has 2 Po
## # A tibble: 5 x 2
##   BsmtCond        n
##   <fct>       <int>
## 1 Fa             45
## 2 Gd             65
## 3 no basement    37
## 4 Po              2
## 5 TA           1311
dat %>% filter(is.na(SalePrice)) %>% group_by(BsmtCond) %>% tally() #test has 3 Po
## # A tibble: 5 x 2
##   BsmtCond        n
##   <fct>       <int>
## 1 Fa             59
## 2 Gd             58
## 3 no basement    42
## 4 Po              3
## 5 TA           1297
train.test.graph("BsmtCond", dat)

dat$BsmtCond <- ordTonum(dat$BsmtCond, bsmt.qual)
## The following `from` values were not present in `x`: Ex
#BsmtFinSF1, BsmtFinSF2, BsmtUnfSF

dat %>% select(BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF) %>% head
##   BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1        706          0       150         856
## 2        978          0       284        1262
## 3        486          0       434         920
## 4        216          0       540         756
## 5        655          0       490        1145
## 6        732          0        64         796
cor((dat$BsmtFinSF1 + dat$BsmtFinSF2 + dat$BsmtUnfSF), dat$TotalBsmtSF)
## [1] 1
#correlation is 1 (a perfect linear relationship) and the rows shown above add up exactly, which indicates that BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF equals TotalBsmtSF

train.test.graph("BsmtFinSF1", dat)

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=BsmtFinSF1, y=SalePrice)) + geom_jitter()

train.test.graph("BsmtFinSF2", dat)

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=BsmtFinSF2, y=SalePrice)) + geom_jitter()

train.test.graph("BsmtUnfSF", dat)

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=BsmtUnfSF, y=SalePrice)) + geom_jitter()

train.test.graph("TotalBsmtSF", dat)

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=TotalBsmtSF, y=SalePrice)) + geom_jitter()

which(dat$TotalBsmtSF>3000) #these 6 obs may be outliers
## [1]  333  441  497  524 1299 2550
#the train set has 5 of these outliers
#the test set has 1 of these outliers

dat %>% mutate_if(is.character, as.factor) %>% select(contains("Bsmt")) %>% summary
##     BsmtQual        BsmtCond          BsmtExposure       BsmtFinType1
##  Min.   :0.000   Min.   :0.000   Av         : 418   ALQ        :429  
##  1st Qu.:3.000   1st Qu.:3.000   Gd         : 276   BLQ        :269  
##  Median :4.000   Median :3.000   Mn         : 239   GLQ        :849  
##  Mean   :3.479   Mean   :2.922   No         :1907   LwQ        :154  
##  3rd Qu.:4.000   3rd Qu.:3.000   no basement:  79   no basement: 79  
##  Max.   :5.000   Max.   :4.000                      Rec        :288  
##                                                     Unf        :851  
##    BsmtFinSF1          BsmtFinType2    BsmtFinSF2        BsmtUnfSF     
##  Min.   :   0.0   ALQ        :  53   Min.   :   0.00   Min.   :   0.0  
##  1st Qu.:   0.0   BLQ        :  68   1st Qu.:   0.00   1st Qu.: 220.0  
##  Median : 368.0   GLQ        :  34   Median :   0.00   Median : 467.0  
##  Mean   : 441.3   LwQ        :  87   Mean   :  49.57   Mean   : 560.6  
##  3rd Qu.: 733.0   no basement:  79   3rd Qu.:   0.00   3rd Qu.: 805.0  
##  Max.   :5644.0   Rec        : 105   Max.   :1526.00   Max.   :2336.0  
##                   Unf        :2493                                     
##   TotalBsmtSF    BsmtFullBath     BsmtHalfBath    
##  Min.   :   0   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 793   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median : 989   Median :0.0000   Median :0.00000  
##  Mean   :1051   Mean   :0.4296   Mean   :0.06132  
##  3rd Qu.:1302   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :6110   Max.   :3.0000   Max.   :2.00000  
## 
#will see the Bsmt Bath with other Bath vars

Heating

dat %>% select(Heating, HeatingQC, CentralAir) %>% summary
##   Heating     HeatingQC CentralAir
##  Floor:   1   Ex:1493   N: 196    
##  GasA :2874   Fa:  92   Y:2723    
##  GasW :  27   Gd: 474             
##  Grav :   9   Po:   3             
##  OthW :   2   TA: 857             
##  Wall :   6
#Heating
prop.func("Heating") #quite skewed
##              [,1]
## [1,] 0.0003425831
## [2,] 0.9845837616
## [3,] 0.0092497431
## [4,] 0.0030832477
## [5,] 0.0006851662
## [6,] 0.0020554985
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=Heating, y=SalePrice, fill=Heating)) + geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% group_by(Heating) %>% tally()
## # A tibble: 6 x 2
##   Heating     n
##   <fct>   <int>
## 1 Floor       1
## 2 GasA     1428
## 3 GasW       18
## 4 Grav        7
## 5 OthW        2
## 6 Wall        4
dat %>% filter(is.na(SalePrice)) %>% group_by(Heating) %>% tally()
## # A tibble: 4 x 2
##   Heating     n
##   <fct>   <int>
## 1 GasA     1446
## 2 GasW        9
## 3 Grav        2
## 4 Wall        2
#combining Heating factors
dat$isHeating <- ifelse(dat$Heating == "GasA", "GasA", "Other")
table(dat$isHeating)
## 
##  GasA Other 
##  2874    45
prop.func("isHeating")
##            [,1]
## [1,] 0.98458376
## [2,] 0.01541624
train.test.graph("Heating", dat)

train.test.graph("isHeating", dat)

#Floor and OthW do not occur in the test set, so merge both into GasA
dat$Heating <- as.character(dat$Heating)
dat$Heating[dat$Heating %in% c("Floor", "OthW")] <- "GasA"

train.test.graph("Heating", dat)

table(dat$Heating)
## 
## GasA GasW Grav Wall 
## 2877   27    9    6
#HeatingQC
prop.func("HeatingQC")
##             [,1]
## [1,] 0.511476533
## [2,] 0.031517643
## [3,] 0.162384378
## [4,] 0.001027749
## [5,] 0.293593696
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=HeatingQC, y=SalePrice, fill=HeatingQC))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% group_by(HeatingQC) %>% tally()
## # A tibble: 5 x 2
##   HeatingQC     n
##   <fct>     <int>
## 1 Ex          741
## 2 Fa           49
## 3 Gd          241
## 4 Po            1
## 5 TA          428
dat %>% filter(is.na(SalePrice)) %>% group_by(HeatingQC) %>% tally()
## # A tibble: 5 x 2
##   HeatingQC     n
##   <fct>     <int>
## 1 Ex          752
## 2 Fa           43
## 3 Gd          233
## 4 Po            2
## 5 TA          429
dat$HeatingQC[dat$HeatingQC == "Po"] <- "Fa"

dat %>% 
  filter(!is.na(SalePrice)) %>% 
  group_by(HeatingQC) %>%
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice))
## # A tibble: 4 x 5
##   HeatingQC count    mean median     sd
##   <fct>     <int>   <dbl>  <dbl>  <dbl>
## 1 Ex          741 214914. 194700 87470.
## 2 Fa           50 123181. 122750 50064.
## 3 Gd          241 156859. 152000 52924.
## 4 TA          428 142363. 135000 47226.
train.test.graph("HeatingQC", dat)

table(dat$HeatingQC)
## 
##   Ex   Fa   Gd   Po   TA 
## 1493   95  474    0  857
#heat.qual: lookup from a heating quality label to a numeric score
#(Ex maps to 5, Po maps to 1), used to treat HeatingQC as a continuous variable
heat.qual <- c(Ex = 5, Gd = 4, TA = 3, Fa = 2, Po = 1)

dat$HeatingQC <- ordTonum(dat$HeatingQC, heat.qual)
## The following `from` values were not present in `x`: Po
#CentralAir

prop.func("CentralAir")
##            [,1]
## [1,] 0.06714628
## [2,] 0.93285372
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=CentralAir, y=SalePrice, fill=CentralAir))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% group_by(CentralAir) %>% tally()
## # A tibble: 2 x 2
##   CentralAir     n
##   <fct>      <int>
## 1 N             95
## 2 Y           1365
dat %>% filter(is.na(SalePrice)) %>% group_by(CentralAir) %>% tally()
## # A tibble: 2 x 2
##   CentralAir     n
##   <fct>      <int>
## 1 N            101
## 2 Y           1358
train.test.graph("CentralAir", dat)

Garage

garage.var <- dat %>% select(contains("Garage")) %>% summary
garage.var
##      GarageType    GarageYrBlt      GarageFinish    GarageCars   
##  2Types   :  23   Min.   :   0   Fin      : 719   Min.   :0.000  
##  Attchd   :1723   1st Qu.:1957   no garage: 157   1st Qu.:1.000  
##  Basment  :  36   Median :1977   RFn      : 811   Median :2.000  
##  BuiltIn  : 186   Mean   :1872   Unf      :1232   Mean   :1.767  
##  CarPort  :  15   3rd Qu.:2001                    3rd Qu.:2.000  
##  Detchd   : 779   Max.   :2207                    Max.   :5.000  
##  no garage: 157                                                  
##    GarageArea         GarageQual       GarageCond  
##  Min.   :   0.0   Ex       :   3   Ex       :   3  
##  1st Qu.: 320.0   Fa       : 124   Fa       :  74  
##  Median : 480.0   Gd       :  24   Gd       :  15  
##  Mean   : 472.8   no garage: 157   no garage: 157  
##  3rd Qu.: 576.0   Po       :   5   Po       :  14  
##  Max.   :1488.0   TA       :2606   TA       :2656  
## 
#Garage Year Built
table(dat$GarageYrBlt)
## 
##    0 1895 1896 1900 1906 1908 1910 1914 1915 1916 1917 1918 1919 1920 1921 
##  157    1    1    6    1    1   10    2    7    6    2    3    1   33    5 
## 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 
##    8    6    8   15   15    5    7    2   27    4    4    1    4    8    7 
## 1937 1938 1939 1940 1941 1942 1943 1945 1946 1947 1948 1949 1950 1951 1952 
##    6   11   21   25   14    6    1   10    9    5   19   14   51   17   16 
## 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 
##   23   37   24   41   34   42   36   37   31   37   34   35   34   39   36 
## 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 
##   48   32   32   24   27   29   35   28   50   66   41   35   32   15    9 
## 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 
##   11   19   18   12   18   20   19   26   17   27   49   39   35   40   44 
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2207 
##   58   54   55   41   53   92   99  142  115  115   61   29    5    1
#GarageYrBlt = 2207

dat %>% filter(GarageYrBlt == 2207) %>% head #it's in testset
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         20       RL          68    8298   Pave no alley      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         HLS    AllPub    Inside       Gtl       Timber       Norm
##   Condition2 OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## 1       Norm           8           5      2006         2007       Hip
##   RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual
## 1  CompShg     VinylSd     VinylSd       None          0         4
##   ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 1         3      PConc        4        3           Av          GLQ
##   BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 1        583          Unf          0       963        1546    GasA
##   HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1         5          Y      SBrkr      1564         0            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1564            0            0        2        0            2
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Ex            6        Typ          1          Gd
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2207          RFn          2        502         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y        132           0             0          0
##   ScreenPorch PoolArea  PoolQC    Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0 no pool no fence        None       0      9   2007
##   SaleType SaleCondition SalePrice BldgType.new HouseStyle.new   Exterior
## 1      New       Partial        NA    high bldg   medium house same Exter
##   isHeating
## 1      GasA
#replace this as max value of other year values

dat %>% select(contains("Year"), contains("Yr")) %>% summary
##    YearBuilt     YearRemodAdd   GarageYrBlt       YrSold    
##  Min.   :1872   Min.   :1950   Min.   :   0   Min.   :2006  
##  1st Qu.:1954   1st Qu.:1965   1st Qu.:1957   1st Qu.:2007  
##  Median :1973   Median :1993   Median :1977   Median :2008  
##  Mean   :1971   Mean   :1984   Mean   :1872   Mean   :2008  
##  3rd Qu.:2001   3rd Qu.:2004   3rd Qu.:2001   3rd Qu.:2009  
##  Max.   :2010   Max.   :2010   Max.   :2207   Max.   :2010
#2010!

# Overwrite the implausible garage build year 2207 with 2010.
dat$GarageYrBlt <- replace(dat$GarageYrBlt, which(dat$GarageYrBlt == 2207), 2010)


# Mean SalePrice per garage build year, using only rows that have a SalePrice
# and a non-zero GarageYrBlt.
garage.year.price <- dat %>%
  filter(!is.na(SalePrice), GarageYrBlt != 0) %>%
  group_by(GarageYrBlt) %>%
  summarise(mean.grg = mean(SalePrice))

ggplot(garage.year.price, aes(x=as.factor(GarageYrBlt), y=mean.grg, group=1)) +
  geom_line() +
  theme(axis.text.x = element_text(angle=90))

#GarageAge
#YrSold - GarageYrBlt

dat %>% mutate(GarageAge = YrSold - GarageYrBlt) %>% select(GarageAge) %>% summary
##    GarageAge     
##  Min.   :  -3.0  
##  1st Qu.:   7.0  
##  Median :  30.0  
##  Mean   : 136.2  
##  3rd Qu.:  51.0  
##  Max.   :2010.0
#negative value of GarageAge

dat %>% mutate(GarageAge = YrSold - GarageYrBlt) %>% filter(GarageAge < 0) %>% select(GarageAge, YrSold, GarageYrBlt, SalePrice) %>% head
##   GarageAge YrSold GarageYrBlt SalePrice
## 1        -1   2007        2008        NA
## 2        -3   2007        2010        NA
#Both rows are in the test set (SalePrice is NA).
#Replace GarageYrBlt with the value of YrSold for these rows.
dat %>% mutate(GarageAge = YrSold - GarageYrBlt) %>% filter(GarageAge < 0) %>% mutate(GarageYrBlt = YrSold) %>% select(GarageAge, YrSold, GarageYrBlt, SalePrice) %>% head
##   GarageAge YrSold GarageYrBlt SalePrice
## 1        -1   2007        2007        NA
## 2        -3   2007        2007        NA
dat %>% mutate(GarageAge = YrSold - GarageYrBlt) %>% filter(GarageAge < 1000 & !is.na(SalePrice)) %>%
  ggplot(aes(x=GarageAge, y=SalePrice)) + geom_jitter()

#no garage values
dat %>% mutate(GarageAge = YrSold - GarageYrBlt) %>% filter(GarageAge > 1000) %>% select(GarageAge, YrSold, GarageYrBlt, SalePrice) %>% head
##   GarageAge YrSold GarageYrBlt SalePrice
## 1      2008   2008           0     82000
## 2      2009   2009           0    113000
## 3      2010   2010           0    136500
## 4      2009   2009           0     85000
## 5      2007   2007           0    123600
## 6      2010   2010           0    128950
dat %>% filter(GarageYrBlt==0) %>% group_by(GarageYrBlt) %>% tally
## # A tibble: 1 x 2
##   GarageYrBlt     n
##         <dbl> <int>
## 1           0   157
#no garage values (GarageYrBlt == 0) -> GarageAge drawn at random from 90:110
#NOTE(review): no seed is set in this section, so the imputed ages differ
#between runs unless set.seed() is called earlier in the script -- confirm.

#both year columns must be complete, otherwise the masks below are ambiguous
stopifnot(!anyNA(dat$GarageYrBlt), !anyNA(dat$YrSold))

no.garage <- dat$GarageYrBlt == 0
built.after.sale <- dat$GarageYrBlt > dat$YrSold

#a garage cannot be newer than the sale: clamp its build year to YrSold
dat$GarageYrBlt[built.after.sale] <- dat$YrSold[built.after.sale]

#age at sale for every row (0 for the clamped rows), then overwrite the
#no-garage rows with one random draw each
dat$GarageAge <- dat$YrSold - dat$GarageYrBlt
dat$GarageAge[no.garage] <- sample(90:110, sum(no.garage), replace = TRUE)

summary(dat$GarageAge)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    7.00   30.00   33.55   51.00  114.00
train.test.graph("GarageAge", dat)

#GarageFinish

dat %>% 
  filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=GarageFinish, y=SalePrice, fill=GarageFinish))+
  geom_boxplot()

train.test.graph("GarageFinish",dat)

#Ordinal scores used to turn GarageFinish into a numeric variable
#(the values run from 4 for "Fin" down to 1 for "no garage")
grgfin.qual <- setNames(
  c(4, 3, 2, 1),
  c("Fin", "RFn", "Unf", "no garage")
)

dat[["GarageFinish"]] <- ordTonum(dat[["GarageFinish"]], grgfin.qual)

#GarageCars
dat %>%
  filter(!is.na(SalePrice)) %>% mutate(GarageCars = as.factor(GarageCars)) %>%
  ggplot(aes(x=GarageCars, y=SalePrice, fill=GarageCars))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(GarageCars) %>%
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice))
## # A tibble: 5 x 5
##   GarageCars count    mean median      sd
##        <dbl> <int>   <dbl>  <dbl>   <dbl>
## 1          0    81 103317. 100000  32815.
## 2          1   369 128117. 128000  30412.
## 3          2   824 183852. 177750  51617.
## 4          3   181 309636. 295000 106833.
## 5          4     5 192656. 200000  52622.
train.test.graph("GarageCars",dat)

#GarageArea

#log GarageArea
dat %>% mutate(GarageArea = log(GarageArea + 1)) %>%
  ggplot(aes(x=GarageArea, y=GarageCars)) + geom_jitter()

dat %>%
  filter(!is.na(SalePrice) & GarageType != "no garage") %>% 
  ggplot(aes(x=GarageArea, y=SalePrice, color=GarageType))+
  geom_jitter(alpha=0.5)+
  ggtitle("GarageArea vs SalePrice by GarageType")

dat %>%
  filter(!is.na(SalePrice) & GarageType != "no garage") %>% 
  ggplot(aes(x=GarageArea, y=SalePrice, color=as.factor(GarageCars)))+
  geom_jitter(alpha=0.5)+
  labs(color = "GarageCars")+
  ggtitle("GarageArea vs SalePrice by GarageCars")

#GarageArea vs SalePrice for rows with a SalePrice and a garage.
#GarageFinish has already been converted to a numeric score above, so it is
#wrapped in as.factor() to get one discrete colour per finish level (same
#approach as the GarageCars plot) instead of a continuous colour gradient.
dat %>%
  filter(!is.na(SalePrice) & GarageType != "no garage") %>% 
  ggplot(aes(x=GarageArea, y=SalePrice, color=as.factor(GarageFinish)))+
  geom_jitter(alpha=0.5)+
  labs(color = "GarageFinish")+
  ggtitle("GarageArea vs SalePrice by GarageFinish")

train.test.graph("GarageArea", dat)

#GarageQual
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=GarageQual, y=SalePrice, fill=GarageQual))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% group_by(GarageQual) %>% summarise(count = n(),
                                                                         mean = mean(SalePrice),
                                                                         median = median(SalePrice),
                                                                         sd = sd(SalePrice))
## # A tibble: 6 x 5
##   GarageQual count    mean median      sd
##   <fct>      <int>   <dbl>  <dbl>   <dbl>
## 1 Ex             3 241000  127500 202680.
## 2 Fa            48 123573. 115000  42971.
## 3 Gd            14 215861. 209115  74127.
## 4 no garage     81 103317. 100000  32815.
## 5 Po             3 100167.  96500  35144.
## 6 TA          1311 187490. 170000  78775.
dat %>% filter(is.na(SalePrice)) %>% group_by(GarageQual) %>% tally()
## # A tibble: 5 x 2
##   GarageQual     n
##   <fct>      <int>
## 1 Fa            76
## 2 Gd            10
## 3 no garage     76
## 4 Po             2
## 5 TA          1295
dat %>% filter(!is.na(SalePrice)) %>% group_by(GarageQual) %>% tally()
## # A tibble: 6 x 2
##   GarageQual     n
##   <fct>      <int>
## 1 Ex             3
## 2 Fa            48
## 3 Gd            14
## 4 no garage     81
## 5 Po             3
## 6 TA          1311
train.test.graph("GarageQual", dat)

table(dat$GarageQual)
## 
##        Ex        Fa        Gd no garage        Po        TA 
##         3       124        24       157         5      2606
#Ordinal scores for the garage quality/condition labels
grg.qual <- setNames(
  c(5, 4, 3, 2, 1, 0),
  c("Ex", "Gd", "TA", "Fa", "Po", "no garage")
)

#check the recoded counts before overwriting the column
table(ordTonum(dat[["GarageQual"]], grg.qual))
## 
##    0    1    2    3    4    5 
##  157    5  124 2606   24    3
dat$GarageQual <- ordTonum(dat$GarageQual, grg.qual)

#GarageCond

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=GarageCond, y=SalePrice, fill=GarageCond))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% group_by(GarageCond) %>% summarise(count = n(),
                                                                         mean = mean(SalePrice),
                                                                         median = median(SalePrice),
                                                                         sd = sd(SalePrice))
## # A tibble: 6 x 5
##   GarageCond count    mean median     sd
##   <fct>      <int>   <dbl>  <dbl>  <dbl>
## 1 Ex             2 124000  124000  4950.
## 2 Fa            35 114654. 114504 36421.
## 3 Gd             9 179930  148000 64769.
## 4 no garage     81 103317. 100000 32815.
## 5 Po             7 108500  108000 22662.
## 6 TA          1326 187886. 170000 79103.
dat %>% filter(is.na(SalePrice)) %>% group_by(GarageCond) %>% tally()
## # A tibble: 6 x 2
##   GarageCond     n
##   <fct>      <int>
## 1 Ex             1
## 2 Fa            39
## 3 Gd             6
## 4 no garage     76
## 5 Po             7
## 6 TA          1330
dat %>% filter(!is.na(SalePrice)) %>% group_by(GarageCond) %>% tally()
## # A tibble: 6 x 2
##   GarageCond     n
##   <fct>      <int>
## 1 Ex             2
## 2 Fa            35
## 3 Gd             9
## 4 no garage     81
## 5 Po             7
## 6 TA          1326
train.test.graph("GarageCond", dat)

table(dat$GarageCond)
## 
##        Ex        Fa        Gd no garage        Po        TA 
##         3        74        15       157        14      2656
dat$GarageCond <- ordTonum(dat$GarageCond, grg.qual)



#Garage Type
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=GarageType, y=SalePrice, fill=GarageType))+
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% group_by(GarageType) %>% tally()
## # A tibble: 7 x 2
##   GarageType     n
##   <fct>      <int>
## 1 2Types         6
## 2 Attchd       870
## 3 Basment       19
## 4 BuiltIn       88
## 5 CarPort        9
## 6 Detchd       387
## 7 no garage     81
dat %>% filter(is.na(SalePrice)) %>% group_by(GarageType) %>% tally()
## # A tibble: 7 x 2
##   GarageType     n
##   <fct>      <int>
## 1 2Types        17
## 2 Attchd       853
## 3 Basment       17
## 4 BuiltIn       98
## 5 CarPort        6
## 6 Detchd       392
## 7 no garage     76
dat %>% filter(!is.na(SalePrice)) %>% group_by(GarageType) %>% summarise(count = n(),
                                                                         mean = mean(SalePrice),
                                                                         median = median(SalePrice),
                                                                         sd = sd(SalePrice))
## # A tibble: 7 x 5
##   GarageType count    mean median      sd
##   <fct>      <int>   <dbl>  <dbl>   <dbl>
## 1 2Types         6 151283. 159000  34917.
## 2 Attchd       870 202893. 185000  77147.
## 3 Basment       19 160571. 148000  63967.
## 4 BuiltIn       88 254752. 227500 102231.
## 5 CarPort        9 109962. 108000  24638.
## 6 Detchd       387 134091. 129500  41392.
## 7 no garage     81 103317. 100000  32815.
#Ordinal encoding of GarageType:
#BuiltIn                   <- 3 (high)
#Attchd, 2Types, Basment   <- 2 (medium)
#CarPort, Detchd           <- 1 (low)
#no garage                 <- 0
#any other value stays NA

garage.type.score <- c("BuiltIn" = 3,
                       "2Types" = 2, "Basment" = 2, "Attchd" = 2,
                       "CarPort" = 1, "Detchd" = 1,
                       "no garage" = 0)

#look every row up by its label; unname() keeps the column a plain numeric vector
dat$GarageType.new <- unname(garage.type.score[as.character(dat$GarageType)])

table(dat$GarageType.new)
## 
##    0    1    2    3 
##  157  794 1782  186
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=GarageType.new, y=SalePrice))+
  geom_jitter()

train.test.graph("GarageType.new", dat)

dat %>% select(contains("garage")) %>% summary
##      GarageType    GarageYrBlt    GarageFinish     GarageCars   
##  2Types   :  23   Min.   :   0   Min.   :1.000   Min.   :0.000  
##  Attchd   :1723   1st Qu.:1957   1st Qu.:2.000   1st Qu.:1.000  
##  Basment  :  36   Median :1977   Median :3.000   Median :2.000  
##  BuiltIn  : 186   Mean   :1872   Mean   :2.717   Mean   :1.767  
##  CarPort  :  15   3rd Qu.:2001   3rd Qu.:3.000   3rd Qu.:2.000  
##  Detchd   : 779   Max.   :2010   Max.   :4.000   Max.   :5.000  
##  no garage: 157                                                 
##    GarageArea       GarageQual      GarageCond      GarageAge     
##  Min.   :   0.0   Min.   :0.000   Min.   :0.000   Min.   :  0.00  
##  1st Qu.: 320.0   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:  7.00  
##  Median : 480.0   Median :3.000   Median :3.000   Median : 30.00  
##  Mean   : 472.8   Mean   :2.803   Mean   :2.811   Mean   : 33.55  
##  3rd Qu.: 576.0   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.: 51.00  
##  Max.   :1488.0   Max.   :5.000   Max.   :5.000   Max.   :114.00  
##                                                                   
##  GarageType.new 
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :2.000  
##  Mean   :1.684  
##  3rd Qu.:2.000  
##  Max.   :3.000  
## 
#drop GarageYrBlt, keeping every other column
dat <- dat[, names(dat) != "GarageYrBlt", drop = FALSE]

Pool

dat %>% select(contains("Pool")) %>% colnames
## [1] "PoolArea" "PoolQC"
table(dat$PoolQC)
## 
##      Ex      Fa      Gd no pool 
##       4       2       4    2909
#Ordinal scores for the pool quality labels ("no pool" scores 0)
pool.qual <- setNames(
  c(5, 4, 3, 2, 1, 0),
  c("Ex", "Gd", "TA", "Fa", "Po", "no pool")
)

dat[["PoolQC"]] <- ordTonum(dat[["PoolQC"]], pool.qual)
## The following `from` values were not present in `x`: TA, Po
summary(dat$PoolArea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   2.252   0.000 800.000
dat %>% filter(PoolArea > 0) %>% group_by(PoolArea) %>% tally()
## # A tibble: 13 x 2
##    PoolArea     n
##       <dbl> <int>
##  1      144     1
##  2      228     1
##  3      368     1
##  4      444     1
##  5      480     1
##  6      512     1
##  7      519     1
##  8      555     1
##  9      561     1
## 10      576     1
## 11      648     1
## 12      738     1
## 13      800     1
#only 13 obs recorded

dat %>% filter(!is.na(SalePrice) & PoolArea != 0) %>%
  ggplot(aes(x=PoolArea, y=SalePrice)) + geom_jitter()

Miscellaneous..

dat %>% select(contains("Misc")) %>% colnames
## [1] "MiscFeature" "MiscVal"
#MiscFeature
table(dat$MiscFeature)
## 
## Gar2 None Othr Shed TenC 
##    5 2814    4   95    1
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=MiscFeature, y=SalePrice, fill=MiscFeature))+
  geom_boxplot()

dat$noMiscFeature <- ifelse(dat$MiscFeature == "None", "None", "Misc")
table(dat$noMiscFeature)
## 
## Misc None 
##  105 2814
table(dat$MiscVal)
## 
##     0    54    80   300   350   400   420   450   455   460   480   490 
##  2816     1     1     1     1    18     1     9     1     1     2     1 
##   500   560   600   620   650   700   750   800   900  1000  1150  1200 
##    13     1     8     1     3     7     1     1     1     1     1     3 
##  1300  1400  1500  1512  2000  2500  3000  3500  4500  6500  8300 12500 
##     1     1     3     1     7     2     2     1     2     1     1     1 
## 15500 17000 
##     1     1
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=MiscVal, y=SalePrice))+
  geom_jitter()

train.test.graph("MiscFeature",dat)

train.test.graph("MiscVal",dat)

Porch

dat %>% select(contains("Porch")) %>% summary
##   OpenPorchSF     EnclosedPorch      X3SsnPorch       ScreenPorch    
##  Min.   :  0.00   Min.   :   0.0   Min.   :  0.000   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:   0.0   1st Qu.:  0.000   1st Qu.:  0.00  
##  Median : 26.00   Median :   0.0   Median :  0.000   Median :  0.00  
##  Mean   : 47.49   Mean   :  23.1   Mean   :  2.602   Mean   : 16.06  
##  3rd Qu.: 70.00   3rd Qu.:   0.0   3rd Qu.:  0.000   3rd Qu.:  0.00  
##  Max.   :742.00   Max.   :1012.0   Max.   :508.000   Max.   :576.00
#OpenPorchSF: open porch area against SalePrice for rows with a known SalePrice
#(the former mutate() step assigned both columns to themselves and was a no-op)
dat %>% 
  filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=OpenPorchSF, y=SalePrice))+
  geom_jitter()

train.test.graph("OpenPorchSF", dat)

#EnclosedPorch

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=EnclosedPorch, y=SalePrice))+
  geom_jitter()

train.test.graph("EnclosedPorch", dat)

#X3SsnPorch

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=X3SsnPorch, y=SalePrice))+
  geom_jitter()

train.test.graph("X3SsnPorch", dat)

#ScreenPorch

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=ScreenPorch, y=SalePrice))+
  geom_jitter()

train.test.graph("ScreenPorch", dat)

#combine all 4 porch areas into a single total, stored as dat$Porch
dat$Porch <- with(dat, OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch)

#log-log scatter of the total against SalePrice (rows with a known SalePrice)
dat %>%
  filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=log(Porch+1), y=log(SalePrice+1)))+
  geom_jitter()

summary(dat$Porch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   50.00   89.25  136.00 1207.00
train.test.graph("Porch", dat)

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=Porch, y=SalePrice)) + geom_jitter()

dat %>% select(contains("porch")) %>% colnames
## [1] "OpenPorchSF"   "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"  
## [5] "Porch"
porch <- dat %>% filter(!is.na(SalePrice)) %>% select_if(is.numeric) %>% select(contains("porch"), SalePrice)

cor(porch)
##                OpenPorchSF EnclosedPorch   X3SsnPorch ScreenPorch
## OpenPorchSF    1.000000000   -0.09307932 -0.005842499  0.07430394
## EnclosedPorch -0.093079318    1.00000000 -0.037305283 -0.08286424
## X3SsnPorch    -0.005842499   -0.03730528  1.000000000 -0.03143585
## ScreenPorch    0.074303944   -0.08286424 -0.031435847  1.00000000
## Porch          0.613542971    0.46808561  0.236688801  0.51995535
## SalePrice      0.315856227   -0.12857796  0.044583665  0.11144657
##                   Porch   SalePrice
## OpenPorchSF   0.6135430  0.31585623
## EnclosedPorch 0.4680856 -0.12857796
## X3SsnPorch    0.2366888  0.04458367
## ScreenPorch   0.5199553  0.11144657
## Porch         1.0000000  0.19573894
## SalePrice     0.1957389  1.00000000
dat$isPorch <- ifelse(dat$Porch != 0, "Porch", "no Porch")
table(dat$isPorch)
## 
## no Porch    Porch 
##      873     2046
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=isPorch, y=SalePrice)) + geom_boxplot()

train.test.graph("isPorch", dat)

Year or Month..

dat %>% select(contains("Year")) %>% colnames
## [1] "YearBuilt"    "YearRemodAdd"
dat %>% select(contains("Yr")) %>% colnames
## [1] "YrSold"
dat %>% select(contains("Mo")) %>% colnames
## [1] "YearRemodAdd" "MoSold"
#YearBuilt / YearRemodAdd / GarageYear / YrSold / MoSold

#YearBuilt / YearRemodAdd / GarageYear / YrSold / MoSold

#YearBuilt 

table(dat$YearBuilt)
## 
## 1872 1875 1879 1880 1882 1885 1890 1892 1893 1895 1896 1898 1900 1901 1902 
##    1    1    1    5    1    2    7    2    1    3    1    1   29    2    1 
## 1904 1905 1906 1907 1908 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 
##    1    3    1    1    2   43    1    5    1    8   24   10    3   10    5 
## 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1934 1935 
##   57   11   16   17   16   34   19    9    9    8   26    7    5    5   13 
## 1936 1937 1938 1939 1940 1941 1942 1945 1946 1947 1948 1949 1950 1951 1952 
##   11    9   13   20   36   23    6   15   15   11   27   18   38   18   18 
## 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 
##   24   43   34   39   35   48   43   37   34   35   35   33   34   35   41 
## 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 
##   45   28   42   39   40   21   23   25   54   57   39   21   23    9    7 
## 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 
##    8   19    7   10    8   15    8   19   12   27   39   37   31   34   35 
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 
##   46   52   48   35   47   88   99  142  138  109   49   25    3
dat %>% select(contains("Year"),contains("Yr")) %>% summary
##    YearBuilt     YearRemodAdd      YrSold    
##  Min.   :1872   Min.   :1950   Min.   :2006  
##  1st Qu.:1954   1st Qu.:1965   1st Qu.:2007  
##  Median :1973   Median :1993   Median :2008  
##  Mean   :1971   Mean   :1984   Mean   :2008  
##  3rd Qu.:2001   3rd Qu.:2004   3rd Qu.:2009  
##  Max.   :2010   Max.   :2010   Max.   :2010
#take a closer look at YearBuilt and YearRemodAdd

dat %>% select(contains("Year")) %>% head
##   YearBuilt YearRemodAdd
## 1      2003         2003
## 2      1976         1976
## 3      2001         2002
## 4      1915         1970
## 5      2000         2000
## 6      1993         1995
which(dat$YearBuilt == dat$YearRemodAdd) %>% head
## [1]  1  2  5  8 11 13
#I will create two variables from here

#Age : years between the sale and the recorded remodel year (YrSold - YearRemodAdd)
dat$Age <- dat$YrSold - dat$YearRemodAdd

#list the rows whose remodel year is later than the sale year (negative Age)
dat %>% filter(Age < 0)
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         60       RL         130   40094   Pave no alley      IR1
## 2         60       RL         134   16659   Pave no alley      IR1
## 3         20       RL         128   39290   Pave no alley      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Bnk    AllPub    Inside       Gtl      Edwards       PosN
## 2         Lvl    AllPub    Corner       Gtl      NridgHt       Norm
## 3         Bnk    AllPub    Inside       Gtl      Edwards       Norm
##   Condition2 OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## 1       PosN          10           5      2007         2008       Hip
## 2       Norm           8           5      2007         2008     Gable
## 3       Norm          10           5      2008         2009       Hip
##   RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual
## 1  CompShg     CemntBd     CmentBd      Stone        762         5
## 2  CompShg     VinylSd     VinylSd       None          0         4
## 3  CompShg     CemntBd     CmentBd      Stone       1224         5
##   ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 1         3      PConc        5        3           Gd          GLQ
## 2         3      PConc        4        3           No          Unf
## 3         3      PConc        5        3           Gd          GLQ
##   BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 1       2260          Unf          0       878        3138    GasA
## 2          0          Unf          0      1582        1582    GasA
## 3       4010          Unf          0      1085        5095    GasA
##   HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1         5          Y      SBrkr      3138      1538            0
## 2         5          Y      SBrkr      1582       570            0
## 3         5          Y      SBrkr      5095         0            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      4676            1            0        3        1            3
## 2      2152            0            0        2        1            3
## 3      5095            1            1        2        1            2
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Ex           11        Typ          1          Gd
## 2            1          Gd            7        Typ          1          Gd
## 3            1          Ex           15        Typ          2          Gd
##   GarageType GarageFinish GarageCars GarageArea GarageQual GarageCond
## 1    BuiltIn            4          3        884          3          3
## 2     Detchd            2          2        728          3          3
## 3     Attchd            4          3       1154          3          3
##   PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 1          Y        208         406             0          0           0
## 2          Y          0         368             0          0           0
## 3          Y        546         484             0          0           0
##   PoolArea PoolQC    Fence MiscFeature MiscVal MoSold YrSold SaleType
## 1        0      0 no fence        None       0     10   2007      New
## 2        0      0 no fence        None       0      6   2007      New
## 3        0      0 no fence        None   17000     10   2007      New
##   SaleCondition SalePrice BldgType.new HouseStyle.new          Exterior
## 1       Partial    184750    high bldg     high house more option Exter
## 2       Partial        NA    high bldg     high house        same Exter
## 3       Partial        NA    high bldg   medium house more option Exter
##   isHeating GarageAge GarageType.new noMiscFeature Porch isPorch Age
## 1      GasA         0              3          None   406   Porch  -1
## 2      GasA         0              1          None   368   Porch  -1
## 3      GasA         0              2          None   484   Porch  -2
#These may be houses that were sold before the remodeling was completed.
#The most negative Age listed above is -2, so adding 2 to all observations moves the minimum to 0.
#NOTE(review): this shifts every Age, not only the negative ones -- confirm that is intended.
dat$Age <- dat$Age + 2

train.test.graph("Age", dat)

#YearBuilt 

summary(dat$YearBuilt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1872    1954    1973    1971    2001    2010
dat %>% 
  filter(!is.na(SalePrice)) %>% 
  group_by(YearBuilt) %>% 
  summarise(mean.grg = mean(SalePrice)) %>%
  ggplot(aes(x=as.factor(YearBuilt), y=mean.grg, group=1))+
  geom_line()+
  theme(axis.text.x = element_text(angle=90))

#Decade bucket of YearBuilt, computed by floor_decade() (defined elsewhere).
#floor_decade() is still called one value at a time because it is not visible
#from here whether it accepts a whole vector. The results are collected in a
#plain vector and written to the data frame once, and seq_len() replaces
#1:nrow(dat) so an empty data frame yields an empty column.
yearbuilt.decade <- rep(NA, nrow(dat))
for(i in seq_len(nrow(dat))){
  yearbuilt.decade[i] <- floor_decade(dat$YearBuilt[i])
}
dat$YearBuilt.deca <- yearbuilt.decade


summary(dat$YearBuilt.deca)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1870    1950    1970    1967    2000    2010
dat %>% 
  filter(!is.na(SalePrice)) %>% 
  group_by(YearBuilt.deca) %>% 
  summarise(mean.grg = mean(SalePrice)) %>%
  ggplot(aes(x=as.factor(YearBuilt.deca), y=mean.grg, group=1))+
  geom_line()+
  theme(axis.text.x = element_text(angle=90))

#YearRemodAdd

summary(dat$YearRemodAdd)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1950    1965    1993    1984    2004    2010
dat %>% 
  filter(!is.na(SalePrice)) %>% 
  group_by(YearRemodAdd) %>% 
  summarise(mean.grg = mean(SalePrice)) %>%
  ggplot(aes(x=as.factor(YearRemodAdd), y=mean.grg, group=1))+
  geom_line()+
  theme(axis.text.x = element_text(angle=90))

#Decade bucket of YearRemodAdd, computed by floor_decade() (defined elsewhere).
#floor_decade() is still called one value at a time because it is not visible
#from here whether it accepts a whole vector. The results are collected in a
#plain vector and written to the data frame once, and seq_len() replaces
#1:nrow(dat) so an empty data frame yields an empty column.
yearremod.decade <- rep(NA, nrow(dat))
for(i in seq_len(nrow(dat))){
  yearremod.decade[i] <- floor_decade(dat$YearRemodAdd[i])
}
dat$YearRemodAdd.deca <- yearremod.decade


summary(dat$YearRemodAdd.deca)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1950    1960    1990    1980    2000    2010
dat %>% 
  filter(!is.na(SalePrice)) %>% 
  group_by(YearRemodAdd.deca) %>% 
  summarise(mean.grg = mean(SalePrice)) %>%
  ggplot(aes(x=as.factor(YearRemodAdd.deca), y=mean.grg, group=1))+
  geom_line()+
  theme(axis.text.x = element_text(angle=90))

#treat the decade buckets as categorical
dat$YearBuilt.deca <- as.factor(dat$YearBuilt.deca)
dat$YearRemodAdd.deca <- as.factor(dat$YearRemodAdd.deca)

#the raw year columns are replaced by the decade factors, so drop them
dat <- dat[, !(names(dat) %in% c("YearBuilt", "YearRemodAdd")), drop = FALSE]

colnames(dat)
##  [1] "MSSubClass"        "MSZoning"          "LotFrontage"      
##  [4] "LotArea"           "Street"            "Alley"            
##  [7] "LotShape"          "LandContour"       "Utilities"        
## [10] "LotConfig"         "LandSlope"         "Neighborhood"     
## [13] "Condition1"        "Condition2"        "OverallQual"      
## [16] "OverallCond"       "RoofStyle"         "RoofMatl"         
## [19] "Exterior1st"       "Exterior2nd"       "MasVnrType"       
## [22] "MasVnrArea"        "ExterQual"         "ExterCond"        
## [25] "Foundation"        "BsmtQual"          "BsmtCond"         
## [28] "BsmtExposure"      "BsmtFinType1"      "BsmtFinSF1"       
## [31] "BsmtFinType2"      "BsmtFinSF2"        "BsmtUnfSF"        
## [34] "TotalBsmtSF"       "Heating"           "HeatingQC"        
## [37] "CentralAir"        "Electrical"        "X1stFlrSF"        
## [40] "X2ndFlrSF"         "LowQualFinSF"      "GrLivArea"        
## [43] "BsmtFullBath"      "BsmtHalfBath"      "FullBath"         
## [46] "HalfBath"          "BedroomAbvGr"      "KitchenAbvGr"     
## [49] "KitchenQual"       "TotRmsAbvGrd"      "Functional"       
## [52] "Fireplaces"        "FireplaceQu"       "GarageType"       
## [55] "GarageFinish"      "GarageCars"        "GarageArea"       
## [58] "GarageQual"        "GarageCond"        "PavedDrive"       
## [61] "WoodDeckSF"        "OpenPorchSF"       "EnclosedPorch"    
## [64] "X3SsnPorch"        "ScreenPorch"       "PoolArea"         
## [67] "PoolQC"            "Fence"             "MiscFeature"      
## [70] "MiscVal"           "MoSold"            "YrSold"           
## [73] "SaleType"          "SaleCondition"     "SalePrice"        
## [76] "BldgType.new"      "HouseStyle.new"    "Exterior"         
## [79] "isHeating"         "GarageAge"         "GarageType.new"   
## [82] "noMiscFeature"     "Porch"             "isPorch"          
## [85] "Age"               "YearBuilt.deca"    "YearRemodAdd.deca"
#YrSold and MoSold

table(dat$YrSold)
## 
## 2006 2007 2008 2009 2010 
##  619  692  622  647  339
table(dat$MoSold)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 122 133 232 279 394 503 446 233 158 173 142 104
dat %>% 
  filter(!is.na(SalePrice)) %>% 
  group_by(YrSold) %>% 
  summarise(mean.grg = mean(SalePrice)) %>%
  ggplot(aes(x=as.factor(YrSold), y=mean.grg, group=1))+
  geom_line()

dat %>% 
  filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=as.factor(YrSold), y=SalePrice, fill=as.factor(YrSold)))+
  geom_boxplot()

dat %>% 
  filter(!is.na(SalePrice)) %>% 
  group_by(MoSold) %>% 
  summarise(mean.grg = mean(SalePrice)) %>%
  ggplot(aes(x=as.factor(MoSold), y=mean.grg, group=1))+
  geom_line()

dat %>% 
  filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=as.factor(MoSold), y=SalePrice))+
  geom_boxplot()

dat %>% 
  filter(!is.na(SalePrice)) %>% mutate(MoSold = as.factor(MoSold), YrSold = as.factor(YrSold)) %>%
  group_by(MoSold, YrSold) %>% 
  summarise(mean.grg = mean(SalePrice)) %>%
  ggplot(aes(x=MoSold, y=mean.grg, group=1))+
  geom_line()+
  facet_wrap(~YrSold)

#Month -> Season
#Winter: 12, 1, 2 / Spring: 3-5 / Summer: 6-8 / Fall: 9-11

#creating Season variable
#Vectorised over the whole MoSold column instead of a row-by-row loop.
#%in% never returns NA, so a missing or out-of-range month falls through to
#the final branch and gets NA.
dat$Season <- case_when(
  dat$MoSold %in% c(12, 1, 2) ~ "Winter",
  dat$MoSold %in% c(3, 4, 5) ~ "Spring",
  dat$MoSold %in% c(6, 7, 8) ~ "Summer",
  dat$MoSold %in% c(9, 10, 11) ~ "Fall",
  TRUE ~ NA_character_
)

dat %>% 
  filter(!is.na(SalePrice)) %>% 
  group_by(Season) %>% 
  summarise(mean.grg = mean(SalePrice)) %>%
  ggplot(aes(x=Season, y=mean.grg, group=1))+
  geom_line()

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=Season, y=SalePrice, fill=Season)) + geom_boxplot()

table(dat$Season) 
## 
##   Fall Spring Summer Winter 
##    473    905   1182    359
train.test.graph("Season", dat)

#Summer has the largest number of sales (1182)
#Winter has the smallest number of sales (359)

dat %>% select(contains("Year")) %>% str
## 'data.frame':    2919 obs. of  2 variables:
##  $ YearBuilt.deca   : Factor w/ 15 levels "1870","1880",..: 14 11 14 5 14 13 14 11 7 7 ...
##  $ YearRemodAdd.deca: Factor w/ 7 levels "1950","1960",..: 6 3 6 3 6 5 6 3 1 1 ...
dat %>% select(contains("Yr")) %>% str
## 'data.frame':    2919 obs. of  1 variable:
##  $ YrSold: num  2008 2007 2008 2006 2008 ...
dat %>% select(contains("Mo")) %>% str
## 'data.frame':    2919 obs. of  2 variables:
##  $ MoSold           : num  2 5 9 2 12 10 8 11 4 1 ...
##  $ YearRemodAdd.deca: Factor w/ 7 levels "1950","1960",..: 6 3 6 3 6 5 6 3 1 1 ...
#YrSold / MoSold / Season
#store the sale year and sale month as factors rather than numbers
dat$YrSold <- as.factor(dat$YrSold)
dat$MoSold <- as.factor(dat$MoSold)


dat %>% mutate_if(is.character, as.factor) %>% summary
##    MSSubClass      MSZoning     LotFrontage        LotArea      
##  20     :1079   C (all):  25   Min.   : 21.00   Min.   :  1300  
##  60     : 575   FV     : 139   1st Qu.: 60.00   1st Qu.:  7478  
##  50     : 287   RH     :  26   Median : 70.00   Median :  9453  
##  120    : 182   RL     :2267   Mean   : 70.06   Mean   : 10168  
##  30     : 139   RM     : 462   3rd Qu.: 80.00   3rd Qu.: 11570  
##  70     : 128                  Max.   :313.00   Max.   :215245  
##  (Other): 529                                                   
##   Street          Alley      LotShape   LandContour  Utilities   
##  Grvl:  12   Gravel  : 120   IR1: 968   Bnk: 117    AllPub:2918  
##  Pave:2907   no alley:2721   IR2:  76   HLS: 120    NoSeWa:   1  
##              Paved   :  78   IR3:  16   Low:  60                 
##                              Reg:1859   Lvl:2622                 
##                                                                  
##                                                                  
##                                                                  
##    LotConfig    LandSlope   Neighborhood    Condition1     Condition2  
##  Corner : 511   Gtl:2778   NAmes  : 443   Norm   :2511   Norm   :2889  
##  CulDSac: 176   Mod: 125   CollgCr: 267   Feedr  : 164   Feedr  :  13  
##  FR2    :  85   Sev:  16   OldTown: 239   Artery :  92   Artery :   5  
##  FR3    :  14              Edwards: 194   RRAn   :  50   PosA   :   4  
##  Inside :2133              Somerst: 182   PosN   :  39   PosN   :   4  
##                            NridgHt: 166   RRAe   :  28   RRNn   :   2  
##                            (Other):1428   (Other):  35   (Other):   2  
##   OverallQual      OverallCond      RoofStyle       RoofMatl   
##  Min.   : 1.000   Min.   :1.000   Flat   :  20   CompShg:2879  
##  1st Qu.: 5.000   1st Qu.:5.000   Gable  :2310   Tar&Grv:  23  
##  Median : 6.000   Median :5.000   Gambrel:  22   WdShake:  10  
##  Mean   : 6.089   Mean   :5.565   Hip    : 551   WdShngl:   7  
##  3rd Qu.: 7.000   3rd Qu.:6.000   Mansard:  11                 
##  Max.   :10.000   Max.   :9.000   Shed   :   5                 
##                                                                
##   Exterior1st    Exterior2nd     MasVnrType     MasVnrArea    
##  VinylSd:1026   CmentBd: 127   BrkCmn :  26   Min.   :   0.0  
##  HdBoard: 485   HdBoard: 475   BrkFace: 879   1st Qu.:   0.0  
##  MetalSd: 450   MetalSd: 532   None   :1765   Median :   0.0  
##  Wd Sdng: 411   Plywood: 323   Stone  : 249   Mean   : 101.4  
##  Plywood: 221   VinylSd:1030                  3rd Qu.: 163.5  
##  CemntBd: 129   Wd Sdng: 432                  Max.   :1600.0  
##  (Other): 197                                                 
##    ExterQual       ExterCond      Foundation      BsmtQual    
##  Min.   :2.000   Min.   :1.000   BrkTil: 311   Min.   :0.000  
##  1st Qu.:3.000   1st Qu.:3.000   CBlock:1235   1st Qu.:3.000  
##  Median :3.000   Median :3.000   PConc :1308   Median :4.000  
##  Mean   :3.397   Mean   :3.086   Slab  :  49   Mean   :3.479  
##  3rd Qu.:4.000   3rd Qu.:3.000   Stone :  11   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Wood  :   5   Max.   :5.000  
##                                                               
##     BsmtCond          BsmtExposure       BsmtFinType1   BsmtFinSF1    
##  Min.   :0.000   Av         : 418   ALQ        :429   Min.   :   0.0  
##  1st Qu.:3.000   Gd         : 276   BLQ        :269   1st Qu.:   0.0  
##  Median :3.000   Mn         : 239   GLQ        :849   Median : 368.0  
##  Mean   :2.922   No         :1907   LwQ        :154   Mean   : 441.3  
##  3rd Qu.:3.000   no basement:  79   no basement: 79   3rd Qu.: 733.0  
##  Max.   :4.000                      Rec        :288   Max.   :5644.0  
##                                     Unf        :851                   
##       BsmtFinType2    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF  
##  ALQ        :  53   Min.   :   0.00   Min.   :   0.0   Min.   :   0  
##  BLQ        :  68   1st Qu.:   0.00   1st Qu.: 220.0   1st Qu.: 793  
##  GLQ        :  34   Median :   0.00   Median : 467.0   Median : 989  
##  LwQ        :  87   Mean   :  49.57   Mean   : 560.6   Mean   :1051  
##  no basement:  79   3rd Qu.:   0.00   3rd Qu.: 805.0   3rd Qu.:1302  
##  Rec        : 105   Max.   :1526.00   Max.   :2336.0   Max.   :6110  
##  Unf        :2493                                                    
##  Heating       HeatingQC     CentralAir Electrical     X1stFlrSF   
##  GasA:2877   Min.   :2.000   N: 196     FuseA: 188   Min.   : 334  
##  GasW:  27   1st Qu.:3.000   Y:2723     FuseF:  50   1st Qu.: 876  
##  Grav:   9   Median :5.000              FuseP:   8   Median :1082  
##  Wall:   6   Mean   :4.153              Mix  :   1   Mean   :1160  
##              3rd Qu.:5.000              SBrkr:2672   3rd Qu.:1388  
##              Max.   :5.000                           Max.   :5095  
##                                                                    
##    X2ndFlrSF       LowQualFinSF        GrLivArea     BsmtFullBath   
##  Min.   :   0.0   Min.   :   0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0.0   1st Qu.:   0.000   1st Qu.:1126   1st Qu.:0.0000  
##  Median :   0.0   Median :   0.000   Median :1444   Median :0.0000  
##  Mean   : 336.5   Mean   :   4.694   Mean   :1501   Mean   :0.4296  
##  3rd Qu.: 704.0   3rd Qu.:   0.000   3rd Qu.:1744   3rd Qu.:1.0000  
##  Max.   :2065.0   Max.   :1064.000   Max.   :5642   Max.   :3.0000  
##                                                                     
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr 
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.00  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.00  
##  Mean   :0.06132   Mean   :1.568   Mean   :0.3803   Mean   :2.86  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.00  
##  Max.   :2.00000   Max.   :4.000   Max.   :2.0000   Max.   :8.00  
##                                                                   
##   KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional    Fireplaces    
##  Min.   :0.000   Ex: 205     Min.   : 2.000   Maj1:  19   Min.   :0.0000  
##  1st Qu.:1.000   Fa:  70     1st Qu.: 5.000   Maj2:   9   1st Qu.:0.0000  
##  Median :1.000   Gd:1151     Median : 6.000   Min1:  65   Median :1.0000  
##  Mean   :1.045   TA:1493     Mean   : 6.452   Min2:  70   Mean   :0.5971  
##  3rd Qu.:1.000               3rd Qu.: 7.000   Mod :  35   3rd Qu.:1.0000  
##  Max.   :3.000               Max.   :15.000   Sev :   2   Max.   :4.0000  
##                                               Typ :2719                   
##        FireplaceQu       GarageType    GarageFinish     GarageCars   
##  Ex          :  43   2Types   :  23   Min.   :1.000   Min.   :0.000  
##  Fa          :  74   Attchd   :1723   1st Qu.:2.000   1st Qu.:1.000  
##  Gd          : 744   Basment  :  36   Median :3.000   Median :2.000  
##  no fireplace:1420   BuiltIn  : 186   Mean   :2.717   Mean   :1.767  
##  Po          :  46   CarPort  :  15   3rd Qu.:3.000   3rd Qu.:2.000  
##  TA          : 592   Detchd   : 779   Max.   :4.000   Max.   :5.000  
##                      no garage: 157                                  
##    GarageArea       GarageQual      GarageCond    PavedDrive
##  Min.   :   0.0   Min.   :0.000   Min.   :0.000   N: 216    
##  1st Qu.: 320.0   1st Qu.:3.000   1st Qu.:3.000   P:  62    
##  Median : 480.0   Median :3.000   Median :3.000   Y:2641    
##  Mean   : 472.8   Mean   :2.803   Mean   :2.811             
##  3rd Qu.: 576.0   3rd Qu.:3.000   3rd Qu.:3.000             
##  Max.   :1488.0   Max.   :5.000   Max.   :5.000             
##                                                             
##    WoodDeckSF       OpenPorchSF     EnclosedPorch      X3SsnPorch     
##  Min.   :   0.00   Min.   :  0.00   Min.   :   0.0   Min.   :  0.000  
##  1st Qu.:   0.00   1st Qu.:  0.00   1st Qu.:   0.0   1st Qu.:  0.000  
##  Median :   0.00   Median : 26.00   Median :   0.0   Median :  0.000  
##  Mean   :  93.71   Mean   : 47.49   Mean   :  23.1   Mean   :  2.602  
##  3rd Qu.: 168.00   3rd Qu.: 70.00   3rd Qu.:   0.0   3rd Qu.:  0.000  
##  Max.   :1424.00   Max.   :742.00   Max.   :1012.0   Max.   :508.000  
##                                                                       
##   ScreenPorch        PoolArea           PoolQC            Fence     
##  Min.   :  0.00   Min.   :  0.000   Min.   :0.0000   GdPrv   : 118  
##  1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:0.0000   GdWo    : 112  
##  Median :  0.00   Median :  0.000   Median :0.0000   MnPrv   : 329  
##  Mean   : 16.06   Mean   :  2.252   Mean   :0.0137   MnWw    :  12  
##  3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:0.0000   no fence:2348  
##  Max.   :576.00   Max.   :800.000   Max.   :5.0000                  
##                                                                     
##  MiscFeature    MiscVal             MoSold     YrSold       SaleType   
##  Gar2:   5   Min.   :    0.00   6      :503   2006:619   WD     :2526  
##  None:2814   1st Qu.:    0.00   7      :446   2007:692   New    : 239  
##  Othr:   4   Median :    0.00   5      :394   2008:622   COD    :  87  
##  Shed:  95   Mean   :   50.83   4      :279   2009:647   ConLD  :  26  
##  TenC:   1   3rd Qu.:    0.00   8      :233   2010:339   CWD    :  12  
##              Max.   :17000.00   3      :232              ConLI  :   9  
##                                 (Other):832              (Other):  20  
##  SaleCondition    SalePrice         BldgType.new       HouseStyle.new
##  Abnorml: 190   Min.   : 34900   high bldg:2652   high house  : 880  
##  AdjLand:  12   1st Qu.:129975   low bldg : 267   low house   : 440  
##  Alloca :  24   Median :163000                    medium house:1599  
##  Family :  46   Mean   :180921                                       
##  Normal :2402   3rd Qu.:214000                                       
##  Partial: 245   Max.   :755000                                       
##                 NA's   :1459                                         
##               Exterior    isHeating      GarageAge      GarageType.new 
##  more option Exter: 437   GasA :2874   Min.   :  0.00   Min.   :0.000  
##  same Exter       :2482   Other:  45   1st Qu.:  7.00   1st Qu.:1.000  
##                                        Median : 30.00   Median :2.000  
##                                        Mean   : 33.55   Mean   :1.684  
##                                        3rd Qu.: 51.00   3rd Qu.:2.000  
##                                        Max.   :114.00   Max.   :3.000  
##                                                                        
##  noMiscFeature     Porch             isPorch          Age       
##  Misc: 105     Min.   :   0.00   no Porch: 873   Min.   : 0.00  
##  None:2814     1st Qu.:   0.00   Porch   :2046   1st Qu.: 6.00  
##                Median :  50.00                   Median :17.00  
##                Mean   :  89.25                   Mean   :25.53  
##                3rd Qu.: 136.00                   3rd Qu.:45.00  
##                Max.   :1207.00                   Max.   :62.00  
##                                                                 
##  YearBuilt.deca YearRemodAdd.deca    Season    
##  2000   :780    1950: 577         Fall  : 473  
##  1970   :361    1960: 289         Spring: 905  
##  1960   :357    1970: 334         Summer:1182  
##  1950   :340    1980: 152         Winter: 359  
##  1990   :332    1990: 487                      
##  1920   :196    2000:1067                      
##  (Other):553    2010:  13

Floor SF

# Floor square-footage variables:
# X1stFlrSF / X2ndFlrSF / LowQualFinSF / GrLivArea

head(select(dat, X1stFlrSF, X2ndFlrSF, LowQualFinSF, GrLivArea))
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## 1       856       854            0      1710
## 2      1262         0            0      1262
## 3       920       866            0      1786
## 4       961       756            0      1717
## 5      1145      1053            0      2198
## 6       796       566            0      1362
# Correlation between the sum of the three floor components and GrLivArea
with(dat, cor(X1stFlrSF + X2ndFlrSF + LowQualFinSF, GrLivArea))
## [1] 1
#X1stFlrSF + X2ndFlrSF + LowQualFinSF = GrLivArea


# For each floor-SF variable: train/test comparison graph, then a jittered
# scatter against SalePrice (rows where SalePrice is not missing)

# X1stFlrSF
train.test.graph("X1stFlrSF", dat)

ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = X1stFlrSF, y = SalePrice)
) +
  geom_jitter()

# X2ndFlrSF
train.test.graph("X2ndFlrSF", dat)

ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = X2ndFlrSF, y = SalePrice)
) +
  geom_jitter()

# LowQualFinSF
train.test.graph("LowQualFinSF", dat)

ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = LowQualFinSF, y = SalePrice)
) +
  geom_jitter()

# GrLivArea
train.test.graph("GrLivArea", dat)

ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = GrLivArea, y = SalePrice)
) +
  geom_jitter()

# Correlation matrix of the floor-SF variables and SalePrice
sf <- select(
  filter(dat, !is.na(SalePrice)),
  X1stFlrSF, X2ndFlrSF, LowQualFinSF, GrLivArea, SalePrice
)

cor(sf)
##                X1stFlrSF   X2ndFlrSF LowQualFinSF GrLivArea   SalePrice
## X1stFlrSF     1.00000000 -0.20264618  -0.01424067 0.5660240  0.60585218
## X2ndFlrSF    -0.20264618  1.00000000   0.06335295 0.6875011  0.31933380
## LowQualFinSF -0.01424067  0.06335295   1.00000000 0.1346828 -0.02560613
## GrLivArea     0.56602397  0.68750106   0.13468281 1.0000000  0.70862448
## SalePrice     0.60585218  0.31933380  -0.02560613 0.7086245  1.00000000
#different shape of density plot for Floor SF
#These variables might have impact on prediction 

Bath

# Columns whose name contains "Bath"
colnames(select(dat, contains("Bath")))
## [1] "BsmtFullBath" "BsmtHalfBath" "FullBath"     "HalfBath"
# BsmtFullBath against SalePrice
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = BsmtFullBath, y = SalePrice)
) +
  geom_jitter()

train.test.graph("BsmtFullBath", dat)

# BsmtHalfBath against SalePrice
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = BsmtHalfBath, y = SalePrice)
) +
  geom_jitter()

train.test.graph("BsmtHalfBath", dat)

# FullBath against SalePrice
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = FullBath, y = SalePrice)
) +
  geom_jitter()

# Number of basement bathrooms (full + half) per house
with(dat, table(BsmtFullBath + BsmtHalfBath))
## 
##    0    1    2    3 
## 1552 1304   60    3
dat %>%
  filter(!is.na(SalePrice)) %>%
  mutate(bsmtbath = BsmtFullBath + BsmtHalfBath) %>%
  ggplot(mapping = aes(x = bsmtbath, y = SalePrice)) +
  geom_jitter()

# Creating BsmtBath: does the house have any basement bathroom?
#   BsmtFullBath + BsmtHalfBath == 0 -> "no BsmtBath"
#   otherwise                        -> "BsmtBath"
dat$BsmtBath <- with(
  dat,
  ifelse(BsmtFullBath + BsmtHalfBath == 0, "no BsmtBath", "BsmtBath")
)
table(dat[["BsmtBath"]])
## 
##    BsmtBath no BsmtBath 
##        1367        1552
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = BsmtBath, y = SalePrice, fill = BsmtBath)
) +
  geom_boxplot()

# Full bathrooms against SalePrice
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = FullBath, y = SalePrice)
) +
  geom_jitter()

train.test.graph("FullBath",dat)

# Half bathrooms against SalePrice
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = HalfBath, y = SalePrice)
) +
  geom_jitter()

train.test.graph("HalfBath", dat)

# Total bathrooms (full + half) against SalePrice


dat %>%
  filter(!is.na(SalePrice)) %>%
  mutate(totalbath = FullBath + HalfBath) %>%
  ggplot(mapping = aes(x = totalbath, y = SalePrice)) +
  geom_jitter()

# Creating Bath: two-level flag built from FullBath + HalfBath
#   sum <= 1 -> "1 bath"
#   sum >  1 -> "more than 2 baths"
# NOTE(review): the second label is applied whenever the sum exceeds 1, so it
# also covers houses with exactly 2 bathrooms.
dat$Bath <- with(
  dat,
  ifelse(FullBath + HalfBath <= 1, "1 bath", "more than 2 baths")
)
table(dat[["Bath"]])
## 
##            1 bath more than 2 baths 
##               947              1972
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = Bath, y = SalePrice, fill = Bath)
) +
  geom_boxplot()

# Creating TotalBath: all bathrooms, with each half bath weighted 0.5
dat$TotalBath <- with(
  dat,
  BsmtHalfBath * 0.5 + BsmtFullBath + FullBath + HalfBath * 0.5
)

train.test.graph("TotalBath", dat) #different shape

ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = TotalBath, y = SalePrice)
) +
  geom_jitter()

# Numeric bath-related columns plus SalePrice, for the correlation matrix
bath <- dat %>%
  filter(!is.na(SalePrice)) %>%
  select(contains("bath"), SalePrice) %>%
  select_if(is.numeric)

cor(bath)
##              BsmtFullBath BsmtHalfBath    FullBath    HalfBath  TotalBath
## BsmtFullBath   1.00000000  -0.14787096 -0.06451205 -0.03090496 0.58307566
## BsmtHalfBath  -0.14787096   1.00000000 -0.05453581 -0.01233990 0.01209207
## FullBath      -0.06451205  -0.05453581  1.00000000  0.13638059 0.69419714
## HalfBath      -0.03090496  -0.01233990  0.13638059  1.00000000 0.39351601
## TotalBath      0.58307566   0.01209207  0.69419714  0.39351601 1.00000000
## SalePrice      0.22712223  -0.01684415  0.56066376  0.28410768 0.63173107
##                SalePrice
## BsmtFullBath  0.22712223
## BsmtHalfBath -0.01684415
## FullBath      0.56066376
## HalfBath      0.28410768
## TotalBath     0.63173107
## SalePrice     1.00000000
# Drop the TotalBath column again
dat <- subset(dat, select = -TotalBath)

Bedroom

# BedroomAbvGr: bedrooms above grade (does NOT include basement bedrooms)
# TotRmsAbvGrd: total rooms above grade (does not include bathrooms)

summary(select(dat, BedroomAbvGr, TotRmsAbvGrd))
##   BedroomAbvGr   TotRmsAbvGrd   
##  Min.   :0.00   Min.   : 2.000  
##  1st Qu.:2.00   1st Qu.: 5.000  
##  Median :3.00   Median : 6.000  
##  Mean   :2.86   Mean   : 6.452  
##  3rd Qu.:3.00   3rd Qu.: 7.000  
##  Max.   :8.00   Max.   :15.000
table(dat[["BedroomAbvGr"]])
## 
##    0    1    2    3    4    5    6    8 
##    8  103  742 1596  400   48   21    1
table(dat[["TotRmsAbvGrd"]])
## 
##   2   3   4   5   6   7   8   9  10  11  12  13  14  15 
##   1  25 196 583 844 649 347 143  80  32  16   1   1   1
# Bedrooms against total rooms (all rows)
ggplot(data = dat, mapping = aes(x = BedroomAbvGr, y = TotRmsAbvGrd)) +
  geom_jitter()

# Bedrooms against SalePrice
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = BedroomAbvGr, y = SalePrice)
) +
  geom_jitter()

# Total rooms against SalePrice
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = TotRmsAbvGrd, y = SalePrice)
) +
  geom_jitter()

# Candidate feature: BedroomAbvGr + TotRmsAbvGrd, against SalePrice
dat %>%
  filter(!is.na(SalePrice)) %>%
  mutate(totalrooms = BedroomAbvGr + TotRmsAbvGrd) %>%
  ggplot(mapping = aes(x = totalrooms, y = SalePrice)) +
  geom_jitter()

# Store the candidate feature on dat
dat <- mutate(dat, totalrooms = BedroomAbvGr + TotRmsAbvGrd)

train.test.graph("totalrooms", dat)

# Quite different shape

ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = totalrooms, y = SalePrice)
) +
  geom_jitter()

# Correlation matrix of the room counts and SalePrice
rooms <- select(
  filter(dat, !is.na(SalePrice)),
  BedroomAbvGr, TotRmsAbvGrd, totalrooms, SalePrice
)

cor(rooms)
##              BedroomAbvGr TotRmsAbvGrd totalrooms SalePrice
## BedroomAbvGr    1.0000000    0.6766199  0.8480759 0.1682132
## TotRmsAbvGrd    0.6766199    1.0000000  0.9639891 0.5337232
## totalrooms      0.8480759    0.9639891  1.0000000 0.4448281
## SalePrice       0.1682132    0.5337232  0.4448281 1.0000000
# totalrooms may not be needed, so drop it again

dat <- subset(dat, select = -totalrooms)
# Idea: combine Floor SF / Bath / Bedrooms
# These are all important for predicting SalePrice

ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = GrLivArea, y = SalePrice)
) +
  geom_jitter()

# These variables are highly correlated with each other, so some of them
# would be removed because of multicollinearity.
# Combining them is a way to avoid losing their information.


# Log transformation of GrLivArea to make the variance more similar
dat %>%
  filter(!is.na(SalePrice)) %>%
  mutate(GrLivArea = log(GrLivArea + 1)) %>%
  ggplot(mapping = aes(x = GrLivArea, y = SalePrice)) +
  geom_jitter()

Foundation

# Level counts of Foundation (all rows)
summary(select(dat, Foundation))
##   Foundation  
##  BrkTil: 311  
##  CBlock:1235  
##  PConc :1308  
##  Slab  :  49  
##  Stone :  11  
##  Wood  :   5
# SalePrice distribution per foundation type
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = Foundation, y = SalePrice, fill = Foundation)
) +
  geom_boxplot()

# Count, mean, median and sd of SalePrice per foundation type
dat %>%
  filter(!is.na(SalePrice)) %>%
  group_by(Foundation) %>%
  summarise(
    count = n(),
    mean = mean(SalePrice),
    median = median(SalePrice),
    sd = sd(SalePrice)
  )
## # A tibble: 6 x 5
##   Foundation count    mean median     sd
##   <fct>      <int>   <dbl>  <dbl>  <dbl>
## 1 BrkTil       146 132291. 125250 54592.
## 2 CBlock       634 149806. 141500 48295.
## 3 PConc        647 225230. 205000 86866.
## 4 Slab          24 107366. 104150 34214.
## 5 Stone          6 165959. 126500 78558.
## 6 Wood           3 185667. 164000 56695.
# Foundation counts among rows with a missing SalePrice
dat %>%
  filter(is.na(SalePrice)) %>%
  group_by(Foundation) %>%
  tally()
## # A tibble: 6 x 2
##   Foundation     n
##   <fct>      <int>
## 1 BrkTil       165
## 2 CBlock       601
## 3 PConc        661
## 4 Slab          25
## 5 Stone          5
## 6 Wood           2
# Foundation counts among rows with a known SalePrice
dat %>%
  filter(!is.na(SalePrice)) %>%
  group_by(Foundation) %>%
  tally()
## # A tibble: 6 x 2
##   Foundation     n
##   <fct>      <int>
## 1 BrkTil       146
## 2 CBlock       634
## 3 PConc        647
## 4 Slab          24
## 5 Stone          6
## 6 Wood           3
train.test.graph("Foundation", dat)

# Two-level version of Foundation: "high" for PConc, "low" for everything else
dat$Foundation.new <- with(dat, ifelse(Foundation == "PConc", "high", "low"))
table(dat[["Foundation.new"]])
## 
## high  low 
## 1308 1611
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = Foundation.new, y = SalePrice, fill = Foundation.new)
) +
  geom_boxplot()

Lot

# Summary of every column whose name contains "Lot"
summary(select(dat, contains("Lot")))
##   LotFrontage        LotArea       LotShape     LotConfig   
##  Min.   : 21.00   Min.   :  1300   IR1: 968   Corner : 511  
##  1st Qu.: 60.00   1st Qu.:  7478   IR2:  76   CulDSac: 176  
##  Median : 70.00   Median :  9453   IR3:  16   FR2    :  85  
##  Mean   : 70.06   Mean   : 10168   Reg:1859   FR3    :  14  
##  3rd Qu.: 80.00   3rd Qu.: 11570              Inside :2133  
##  Max.   :313.00   Max.   :215245
# LotFrontage against SalePrice
ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = LotFrontage, y = SalePrice)
) +
  geom_jitter()

# Density of LotFrontage (all rows)
ggplot(data = dat, mapping = aes(x = LotFrontage)) +
  geom_density()

# LotFrontage values above 200
with(dat, LotFrontage[LotFrontage > 200])
## [1] 313 313
# Replacing the outliers (currently disabled):
#dat$LotFrontage[dat$LotFrontage > 200] <- sample(180:200, 2)

# Train/test comparison without the rows at or above 300
train.test.graph("LotFrontage", filter(dat, LotFrontage < 300))

# LotArea: density over all rows
ggplot(data = dat, mapping = aes(x = LotArea)) +
  geom_density()

# Heavily skewed distribution

# LotArea against SalePrice, restricted to lots below 60000
ggplot(
  data = filter(dat, !is.na(SalePrice), LotArea < 60000),
  mapping = aes(x = LotArea, y = SalePrice)
) +
  geom_jitter()

# Positions of lots above 60000 among rows with a known SalePrice ...
which(with(dat, LotArea[!is.na(SalePrice)]) > 60000)
## [1]  250  314  336  452  707 1299
# ... and among rows with a missing SalePrice
which(with(dat, LotArea[is.na(SalePrice)]) > 60000)
## integer(0)
train.test.graph("LotArea", filter(dat, LotArea < 60000))

# LotArea summary for rows with a known SalePrice
summary(select(filter(dat, !is.na(SalePrice)), LotArea))
##     LotArea      
##  Min.   :  1300  
##  1st Qu.:  7554  
##  Median :  9478  
##  Mean   : 10517  
##  3rd Qu.: 11602  
##  Max.   :215245
# LotArea summary for rows with a missing SalePrice
summary(select(filter(dat, is.na(SalePrice)), LotArea))
##     LotArea     
##  Min.   : 1470  
##  1st Qu.: 7391  
##  Median : 9399  
##  Mean   : 9819  
##  3rd Qu.:11518  
##  Max.   :56600
# Full rows of the houses with a known SalePrice and LotArea above 60000
filter(dat, !is.na(SalePrice), LotArea > 60000)
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         50       RL    99.37358  159000   Pave no alley      IR2
## 2         20       RL   150.00000  215245   Pave no alley      IR3
## 3        190       RL   118.75789  164660   Grvl no alley      IR1
## 4         20       RL    62.00000   70761   Pave no alley      IR1
## 5         20       RL   100.53883  115149   Pave no alley      IR2
## 6         60       RL   313.00000   63887   Pave no alley      IR3
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Low    AllPub   CulDSac       Sev      ClearCr       Norm
## 2         Low    AllPub    Inside       Sev       Timber       Norm
## 3         HLS    AllPub    Corner       Sev       Timber       Norm
## 4         Low    AllPub    Inside       Mod      ClearCr       Norm
## 5         Low    AllPub   CulDSac       Sev      ClearCr       Norm
## 6         Bnk    AllPub    Corner       Gtl      Edwards      Feedr
##   Condition2 OverallQual OverallCond RoofStyle RoofMatl Exterior1st
## 1       Norm           6           7     Gable  CompShg     Wd Sdng
## 2       Norm           7           5       Hip  CompShg     BrkFace
## 3       Norm           5           6     Gable  CompShg     Plywood
## 4       Norm           7           5     Gable  WdShngl     Plywood
## 5       Norm           7           5     Gable  CompShg     Plywood
## 6       Norm          10           5       Hip  CompShg     HdBoard
##   Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     HdBoard     BrkCmn        472         4         3     CBlock
## 2     Plywood       None          0         3         3     CBlock
## 3     Plywood       None          0         3         3     CBlock
## 4     Plywood       None          0         3         3     CBlock
## 5     Plywood      Stone        351         3         3     CBlock
## 6     HdBoard      Stone        796         5         3      PConc
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1        4        3           Gd          Rec        697          Unf
## 2        4        3           Gd          ALQ       1236          Rec
## 3        3        3           Gd          ALQ       1249          BLQ
## 4        4        3           Gd          ALQ        655          Unf
## 5        4        3           Gd          GLQ       1219          Unf
## 6        5        3           Gd          GLQ       5644          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       747        1444    GasA         4          Y      SBrkr
## 2        820        80        2136    GasW         3          Y      SBrkr
## 3        147       103        1499    GasA         5          Y      SBrkr
## 4          0       878        1533    GasA         3          Y      SBrkr
## 5          0       424        1643    GasA         3          Y      SBrkr
## 6          0       466        6110    GasA         5          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 1      1444       700            0      2144            0            1
## 2      2036         0            0      2036            2            0
## 3      1619       167            0      1786            2            0
## 4      1533         0            0      1533            1            0
## 5      1824         0            0      1824            1            0
## 6      4692       950            0      5642            2            0
##   FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 1        2        0            4            1          Gd            7
## 2        2        0            3            1          TA            8
## 3        2        0            3            1          TA            7
## 4        2        0            2            1          Gd            5
## 5        2        0            2            1          Gd            5
## 6        2        1            3            1          Ex           12
##   Functional Fireplaces FireplaceQu GarageType GarageFinish GarageCars
## 1        Typ          2          TA     Attchd            4          2
## 2        Typ          2          Gd     Attchd            3          2
## 3        Typ          2          Gd     Attchd            4          2
## 4        Typ          2          TA     Attchd            2          2
## 5        Typ          2          TA     Attchd            2          2
## 6        Typ          3          Gd     Attchd            4          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        389          3          3          Y          0          98
## 2        513          3          3          Y          0           0
## 3        529          3          3          Y        670           0
## 4        576          3          3          Y        200          54
## 5        739          3          3          Y        380          48
## 6       1418          3          3          Y        214         292
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC    Fence
## 1             0          0           0        0      0 no fence
## 2             0          0           0        0      0 no fence
## 3             0          0           0        0      0 no fence
## 4             0          0           0        0      0 no fence
## 5             0          0           0        0      0 no fence
## 6             0          0           0      480      4 no fence
##   MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1        Shed     500      6   2007       WD        Normal    277000
## 2        None       0      6   2009       WD        Normal    375000
## 3        Shed     700      8   2008       WD        Normal    228950
## 4        None       0     12   2006       WD        Normal    280000
## 5        None       0      6   2007       WD        Normal    302000
## 6        None       0      1   2008      New       Partial    160000
##   BldgType.new HouseStyle.new          Exterior isHeating GarageAge
## 1    high bldg      low house more option Exter      GasA        49
## 2    high bldg   medium house        same Exter     Other        44
## 3     low bldg      low house        same Exter      GasA        43
## 4    high bldg   medium house        same Exter      GasA        31
## 5    high bldg   medium house        same Exter      GasA        36
## 6    high bldg     high house        same Exter      GasA         0
##   GarageType.new noMiscFeature Porch  isPorch Age YearBuilt.deca
## 1              2          Misc    98    Porch   3           1950
## 2              2          None     0 no Porch  46           1960
## 3              2          Misc     0 no Porch  45           1960
## 4              2          None    54    Porch  33           1970
## 5              2          None    48    Porch   7           1970
## 6              2          None   292    Porch   2           2000
##   YearRemodAdd.deca Season BsmtBath              Bath Foundation.new
## 1              2000 Summer BsmtBath more than 2 baths            low
## 2              1960 Summer BsmtBath more than 2 baths            low
## 3              1960 Summer BsmtBath more than 2 baths            low
## 4              1970 Winter BsmtBath more than 2 baths            low
## 5              2000 Summer BsmtBath more than 2 baths            low
## 6              2000 Winter BsmtBath more than 2 baths           high
# Mean SalePrice over the rows where it is known, for comparison
mean(dat[["SalePrice"]], na.rm = TRUE)
## [1] 180921.2
# We might have to deal with these outliers


# LotShape: SalePrice distribution per lot shape

ggplot(
  data = filter(dat, !is.na(SalePrice)),
  mapping = aes(x = LotShape, y = SalePrice, fill = LotShape)
) +
  geom_boxplot()

# Count, mean, median and sd of SalePrice per lot shape
dat %>%
  filter(!is.na(SalePrice)) %>%
  group_by(LotShape) %>%
  summarise(
    count = n(),
    mean = mean(SalePrice),
    median = median(SalePrice),
    sd = sd(SalePrice)
  )
## # A tibble: 4 x 5
##   LotShape count    mean median     sd
##   <fct>    <int>   <dbl>  <dbl>  <dbl>
## 1 IR1        484 206102. 189000 85858.
## 2 IR2         41 239833. 221000 99669.
## 3 IR3         10 216036. 203570 82540.
## 4 Reg        925 164755. 146000 69673.
#IR1 ~= IR2 ~= IR3 
#Irregular lotshape are more expensive 
#regular cheaper

# LotShape counts among rows with a missing SalePrice
dat %>%
  filter(is.na(SalePrice)) %>%
  group_by(LotShape) %>%
  tally()
## # A tibble: 4 x 2
##   LotShape     n
##   <fct>    <int>
## 1 IR1        484
## 2 IR2         35
## 3 IR3          6
## 4 Reg        934
# LotShape counts among rows with a known SalePrice
dat %>%
  filter(!is.na(SalePrice)) %>%
  group_by(LotShape) %>%
  tally()
## # A tibble: 4 x 2
##   LotShape     n
##   <fct>    <int>
## 1 IR1        484
## 2 IR2         41
## 3 IR3         10
## 4 Reg        925
# Merge the three irregular shapes (IR1, IR2, IR3) into a single "IR" value

# Work on a character copy, then rewrite every irregular code in one step
dat$LotShape <- as.character(dat$LotShape)
dat$LotShape[dat$LotShape %in% c("IR1", "IR2", "IR3")] <- "IR"

table(dat[["LotShape"]])
## 
##   IR  Reg 
## 1060 1859
train.test.graph("LotShape", dat)

#LotConfig

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=LotConfig, y=SalePrice, fill=LotConfig)) + 
  geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(LotConfig) %>% summarise(count = n(),
                                    mean = mean(SalePrice),
                                    median = median(SalePrice),
                                    sd = sd(SalePrice))
## # A tibble: 5 x 5
##   LotConfig count    mean  median     sd
##   <fct>     <int>   <dbl>   <dbl>  <dbl>
## 1 Corner      263 181623. 160000  84466.
## 2 CulDSac      94 223855. 199262  93118.
## 3 FR2          47 177935. 165000  62789.
## 4 FR3           4 208475  195450  78379.
## 5 Inside     1052 176938. 159698. 76427.
dat %>% filter(is.na(SalePrice)) %>% group_by(LotConfig) %>% tally()
## # A tibble: 5 x 2
##   LotConfig     n
##   <fct>     <int>
## 1 Corner      248
## 2 CulDSac      82
## 3 FR2          38
## 4 FR3          10
## 5 Inside     1081
dat %>% filter(!is.na(SalePrice)) %>% group_by(LotConfig) %>% tally()
## # A tibble: 5 x 2
##   LotConfig     n
##   <fct>     <int>
## 1 Corner      263
## 2 CulDSac      94
## 3 FR2          47
## 4 FR3           4
## 5 Inside     1052
# Merge the rare FR3 level (4 train / 10 test rows) into FR2, then drop the
# now-empty FR3 level so it does not linger as a zero-count category in
# tables, plots and any dummy encoding built from this factor.
dat$LotConfig[dat$LotConfig == "FR3"] <- "FR2"
dat$LotConfig <- droplevels(dat$LotConfig)

train.test.graph("LotConfig",dat)

table(dat$LotConfig)
## 
##  Corner CulDSac     FR2  Inside 
##     511     176      99    2133

Kitchen

dat %>% select(contains("Kitchen")) %>% summary
##   KitchenAbvGr   KitchenQual
##  Min.   :0.000   Ex: 205    
##  1st Qu.:1.000   Fa:  70    
##  Median :1.000   Gd:1151    
##  Mean   :1.045   TA:1493    
##  3rd Qu.:1.000              
##  Max.   :3.000
dat %>% ggplot(aes(x=KitchenAbvGr)) + geom_density()

dat %>% ggplot(aes(x=as.factor(KitchenAbvGr), fill=as.factor(KitchenAbvGr))) + geom_bar()

table(dat$KitchenAbvGr)
## 
##    0    1    2    3 
##    3 2785  129    2
dat %>% filter(!is.na(SalePrice)) %>% ggplot(aes(x=KitchenAbvGr, y=SalePrice)) + geom_jitter()

dat %>% filter(!is.na(SalePrice)) %>% 
  group_by(KitchenAbvGr) %>% 
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice))
## # A tibble: 4 x 5
##   KitchenAbvGr count    mean median     sd
##          <dbl> <int>   <dbl>  <dbl>  <dbl>
## 1            0     1 127500  127500    NA 
## 2            1  1392 183389. 165550 80265.
## 3            2    65 131096. 133900 31483.
## 4            3     2 109500  109500  4950.
dat %>% filter(is.na(SalePrice)) %>% 
  group_by(KitchenAbvGr) %>% tally()
## # A tibble: 3 x 2
##   KitchenAbvGr     n
##          <dbl> <int>
## 1            0     2
## 2            1  1393
## 3            2    64
train.test.graph("KitchenAbvGr", dat)

#Note: the test set has no rows with KitchenAbvGr == 3,
#and KitchenAbvGr == 0 occurs only once in the train set and twice in the test set,
#so I will merge the rare values:
#0 -> 1
#3 -> 2

dat$KitchenAbvGr[dat$KitchenAbvGr == 0] <- 1
dat$KitchenAbvGr[dat$KitchenAbvGr == 3] <- 2

train.test.graph("KitchenAbvGr", dat)

dat$isKitchen <- ifelse(dat$KitchenAbvGr == 1, "1 kitchen", "2 kitchens")

table(dat$isKitchen)
## 
##  1 kitchen 2 kitchens 
##       2788        131
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=isKitchen, y=SalePrice, fill=isKitchen)) + geom_boxplot()

# Remove the raw kitchen count; isKitchen (built from it above) is kept instead.
dat$KitchenAbvGr <- NULL

#Too skewed


#kitchen qual
table(dat$KitchenQual)
## 
##   Ex   Fa   Gd   TA 
##  205   70 1151 1493
# Ordinal encoding for KitchenQual: larger number = better quality.
# "Po" is part of the mapping although table(dat$KitchenQual) above shows
# no rows with that value.
kit.qual <- c("Ex" = 5, 
          "Gd"= 4, 
          "TA" = 3, 
          "Fa" = 2, 
          "Po" = 1)


dat %>% ggplot(aes(x=KitchenQual, fill=KitchenQual)) + geom_bar()

dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=KitchenQual, y=SalePrice, fill=KitchenQual)) + geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% 
  group_by(KitchenQual) %>% summarise(count = n(),
                                      mean = mean(SalePrice),
                                      median = median(SalePrice),
                                      sd = sd(SalePrice))
## # A tibble: 4 x 5
##   KitchenQual count    mean median      sd
##   <fct>       <int>   <dbl>  <dbl>   <dbl>
## 1 Ex            100 328555. 316750 120863.
## 2 Fa             39 105565. 115000  36004.
## 3 Gd            586 212116. 201400  64020.
## 4 TA            735 139963. 137000  38896.
dat %>% filter(is.na(SalePrice)) %>% group_by(KitchenQual) %>% tally()
## # A tibble: 4 x 2
##   KitchenQual     n
##   <fct>       <int>
## 1 Ex            105
## 2 Fa             31
## 3 Gd            565
## 4 TA            758
train.test.graph("KitchenQual", dat)

dat$KitchenQual <- ordTonum(dat$KitchenQual, kit.qual)
## The following `from` values were not present in `x`: Po

Fireplace

dat %>% select(contains("Fire")) %>% summary
##    Fireplaces           FireplaceQu  
##  Min.   :0.0000   Ex          :  43  
##  1st Qu.:0.0000   Fa          :  74  
##  Median :1.0000   Gd          : 744  
##  Mean   :0.5971   no fireplace:1420  
##  3rd Qu.:1.0000   Po          :  46  
##  Max.   :4.0000   TA          : 592
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=as.factor(Fireplaces), y=SalePrice, fill=as.factor(Fireplaces))) + geom_boxplot()

table(dat$Fireplaces)
## 
##    0    1    2    3    4 
## 1420 1268  219   11    1
dat %>% mutate(Fireplaces = as.factor(Fireplaces)) %>% filter(!is.na(SalePrice)) %>% 
  group_by(Fireplaces) %>% summarise(count = n(),
                                     mean = mean(SalePrice),
                                     median = median(SalePrice),
                                     sd = sd(SalePrice))
## # A tibble: 4 x 5
##   Fireplaces count    mean median      sd
##   <fct>      <int>   <dbl>  <dbl>   <dbl>
## 1 0            690 141331. 135000  44390.
## 2 1            650 211844. 189975  79532.
## 3 2            115 240589. 206900 117452.
## 4 3              5 252000  205000  93314.
dat %>% mutate(Fireplaces = as.factor(Fireplaces)) %>% filter(is.na(SalePrice)) %>% 
  group_by(Fireplaces) %>% tally()
## # A tibble: 5 x 2
##   Fireplaces     n
##   <fct>      <int>
## 1 0            730
## 2 1            618
## 3 2            104
## 4 3              6
## 5 4              1
dat %>% mutate(Fireplaces = as.factor(Fireplaces)) %>% filter(!is.na(SalePrice)) %>% 
  group_by(Fireplaces) %>% tally()
## # A tibble: 4 x 2
##   Fireplaces     n
##   <fct>      <int>
## 1 0            690
## 2 1            650
## 3 2            115
## 4 3              5
train.test.graph("Fireplaces", dat)

#convert Fireplaces -> binary factor (has fireplaces / no fireplace)

dat$isFireplaces <- as.factor(ifelse(dat$Fireplaces == 0, "no fireplace", "Fireplaces"))


dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=isFireplaces, y=SalePrice, fill=isFireplaces)) + geom_boxplot()

#FireplaceQu

dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=FireplaceQu, y=SalePrice, fill=FireplaceQu))+geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>% group_by(FireplaceQu) %>%
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice))
## # A tibble: 6 x 5
##   FireplaceQu  count    mean median      sd
##   <fct>        <int>   <dbl>  <dbl>   <dbl>
## 1 Ex              24 337712. 314250 123611.
## 2 Fa              33 167298. 158000  34288.
## 3 Gd             380 226351. 206950  91123.
## 4 no fireplace   690 141331. 135000  44390.
## 5 Po              20 129764. 131500  31081.
## 6 TA             313 205723. 187500  71367.
dat %>% filter(is.na(SalePrice)) %>% group_by(FireplaceQu) %>% tally()
## # A tibble: 6 x 2
##   FireplaceQu      n
##   <fct>        <int>
## 1 Ex              19
## 2 Fa              41
## 3 Gd             364
## 4 no fireplace   730
## 5 Po              26
## 6 TA             279
dat %>% filter(!is.na(SalePrice)) %>% group_by(FireplaceQu) %>% tally()
## # A tibble: 6 x 2
##   FireplaceQu      n
##   <fct>        <int>
## 1 Ex              24
## 2 Fa              33
## 3 Gd             380
## 4 no fireplace   690
## 5 Po              20
## 6 TA             313
train.test.graph("FireplaceQu", dat)

table(dat$FireplaceQu)
## 
##           Ex           Fa           Gd no fireplace           Po 
##           43           74          744         1420           46 
##           TA 
##          592
fire.qual <- c("Ex" = 5, 
          "Gd"= 4, 
          "TA" = 3, 
          "Fa" = 2, 
          "Po" = 1, 
          "no fireplace" = 0)

table(ordTonum(dat$FireplaceQu, fire.qual))
## 
##    0    1    2    3    4    5 
## 1420   46   74  592  744   43
dat$FireplaceQu <- ordTonum(dat$FireplaceQu, fire.qual)

MSZoning

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=MSZoning, y=SalePrice, fill=MSZoning)) + geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(MSZoning) %>% summarise(count = n(),
                                   mean = mean(SalePrice),
                                   median = median(SalePrice),
                                   sd = sd(SalePrice))
## # A tibble: 5 x 5
##   MSZoning count    mean median     sd
##   <fct>    <int>   <dbl>  <dbl>  <dbl>
## 1 C (all)     10  74528   74700 33791.
## 2 FV          65 214014. 205950 52370.
## 3 RH          16 131558. 136500 35714.
## 4 RL        1151 191005. 174000 80766.
## 5 RM         218 126317. 120500 48522.
dat %>% filter(is.na(SalePrice)) %>%
  group_by(MSZoning) %>% tally()
## # A tibble: 5 x 2
##   MSZoning     n
##   <fct>    <int>
## 1 C (all)     15
## 2 FV          74
## 3 RH          10
## 4 RL        1116
## 5 RM         244
dat %>% filter(!is.na(SalePrice)) %>%
  group_by(MSZoning) %>% tally()
## # A tibble: 5 x 2
##   MSZoning     n
##   <fct>    <int>
## 1 C (all)     10
## 2 FV          65
## 3 RH          16
## 4 RL        1151
## 5 RM         218
#Combine factors

dat$MSZoning.new <- as.factor(ifelse(dat$MSZoning %in% c("FV", "RL"), "FV + RL", "Others"))

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=MSZoning.new, y=SalePrice, fill=MSZoning.new)) + geom_boxplot()

train.test.graph("MSZoning", dat)

table(dat$MSZoning)
## 
## C (all)      FV      RH      RL      RM 
##      25     139      26    2267     462
msz.qual <- c("FV" = 5,
              "RL" = 4,
              "RM" = 3,
              "RH" = 2,
              "C (all)" = 1)

table(ordTonum(dat$MSZoning, msz.qual))
## 
##    1    2    3    4    5 
##   25   26  462 2267  139
dat$MSZoning <- ordTonum(dat$MSZoning, msz.qual)

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=MSZoning, y=SalePrice)) + geom_jitter()

Neighborhood

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=Neighborhood, y=SalePrice, fill=Neighborhood))+
  geom_boxplot()+
  theme(axis.text.x = element_text(angle=90))

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(Neighborhood) %>% summarise(count = n(),
                                       mean = mean(SalePrice),
                                       median = median(SalePrice),
                                       sd = sd(SalePrice))
## # A tibble: 25 x 5
##    Neighborhood count    mean median     sd
##    <fct>        <int>   <dbl>  <dbl>  <dbl>
##  1 Blmngtn         17 194871. 191000 30393.
##  2 Blueste          2 137500  137500 19092.
##  3 BrDale          16 104494. 106000 14330.
##  4 BrkSide         58 124834. 124300 40349.
##  5 ClearCr         28 212565. 200250 50232.
##  6 CollgCr        150 197966. 197200 51404.
##  7 Crawfor         51 210625. 200624 68866.
##  8 Edwards        100 128220. 121750 43209.
##  9 Gilbert         79 192855. 181000 35987.
## 10 IDOTRR          37 100124. 103000 33377.
## # … with 15 more rows
dat %>% filter(!is.na(SalePrice)) %>%
  group_by(Neighborhood) %>%
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice)) %>%
  ggplot(aes(x=reorder(Neighborhood, count), y=count, fill=Neighborhood))+
  geom_bar(stat="identity")+
  theme(axis.text.x = element_text(angle=90))+
  ggtitle("Neighborhood by Counts")

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(Neighborhood) %>%
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice)) %>% 
  ggplot(aes(x=reorder(Neighborhood, mean), y=mean, fill=Neighborhood))+
  geom_bar(stat="identity")+
  theme(axis.text.x = element_text(angle=90))+
  ggtitle("SalePrice by group mean of Neighborhood")

dat %>% filter(is.na(SalePrice)) %>% group_by(Neighborhood) %>% tally()
## # A tibble: 25 x 2
##    Neighborhood     n
##    <fct>        <int>
##  1 Blmngtn         11
##  2 Blueste          8
##  3 BrDale          14
##  4 BrkSide         50
##  5 ClearCr         16
##  6 CollgCr        117
##  7 Crawfor         52
##  8 Edwards         94
##  9 Gilbert         86
## 10 IDOTRR          56
## # … with 15 more rows
dat %>% filter(!is.na(SalePrice)) %>% group_by(Neighborhood) %>% tally()
## # A tibble: 25 x 2
##    Neighborhood     n
##    <fct>        <int>
##  1 Blmngtn         17
##  2 Blueste          2
##  3 BrDale          16
##  4 BrkSide         58
##  5 ClearCr         28
##  6 CollgCr        150
##  7 Crawfor         51
##  8 Edwards        100
##  9 Gilbert         79
## 10 IDOTRR          37
## # … with 15 more rows
train.test.graph("Neighborhood", dat)

# Per-neighbourhood SalePrice summary on the training rows (SalePrice known),
# sorted from cheapest to most expensive mean, returned as a plain data.frame
# with factor columns converted to character.
neigh <- dat %>%
  filter(!is.na(SalePrice)) %>%
  group_by(Neighborhood) %>%
  summarise(count = n(),
            mean = mean(SalePrice),
            median = median(SalePrice),
            sd = sd(SalePrice)) %>%
  arrange(mean) %>%
  data.frame() %>%
  mutate_if(is.factor, as.character)
neigh
##    Neighborhood count      mean median         sd
## 1       MeadowV    17  98576.47  88000  23491.050
## 2        IDOTRR    37 100123.78 103000  33376.710
## 3        BrDale    16 104493.75 106000  14330.176
## 4       BrkSide    58 124834.05 124300  40348.689
## 5       Edwards   100 128219.70 121750  43208.616
## 6       OldTown   113 128225.30 119000  52650.583
## 7        Sawyer    74 136793.14 135000  22345.129
## 8       Blueste     2 137500.00 137500  19091.883
## 9         SWISU    25 142591.36 139500  32622.918
## 10      NPkVill     9 142694.44 146000   9377.315
## 11        NAmes   225 145847.08 140000  33075.345
## 12      Mitchel    49 156270.12 153500  36486.625
## 13      SawyerW    59 186555.80 179900  55651.998
## 14       NWAmes    73 189050.07 182900  37172.218
## 15      Gilbert    79 192854.51 181000  35986.779
## 16      Blmngtn    17 194870.88 191000  30393.229
## 17      CollgCr   150 197965.77 197200  51403.666
## 18      Crawfor    51 210624.73 200624  68866.395
## 19      ClearCr    28 212565.43 200250  50231.539
## 20      Somerst    86 225379.84 225500  56177.556
## 21      Veenker    11 238772.73 218000  72369.318
## 22       Timber    38 242247.45 228475  64845.652
## 23      StoneBr    25 310499.00 278000 112969.677
## 24      NridgHt    77 316270.62 315000  96392.545
## 25      NoRidge    41 335295.32 301500 121412.659
# Bucket neighbourhoods into four price tiers by their mean training SalePrice.
# Lower bounds are inclusive so that a mean landing exactly on a cut-off
# (120000, 180000 or 250000) is still assigned to a tier instead of falling
# through every comparison.
neigh.1 <- neigh$Neighborhood[neigh$mean < 120000]
neigh.2 <- neigh$Neighborhood[neigh$mean >= 120000 & neigh$mean < 180000]
neigh.3 <- neigh$Neighborhood[neigh$mean >= 180000 & neigh$mean < 250000]
neigh.4 <- neigh$Neighborhood[neigh$mean >= 250000]

# Label every row with its neighbourhood's tier; a neighbourhood that appears
# in none of the four lists gets the "" level.
dat$neigh.group <- 
  as.factor(ifelse(dat$Neighborhood %in% neigh.1, 1, 
                   ifelse(dat$Neighborhood %in% neigh.2, 2,
                          ifelse(dat$Neighborhood %in% neigh.3, 3,
                                 ifelse(dat$Neighborhood %in% neigh.4, 4, "")))))

table(dat$neigh.group)
## 
##    1    2    3    4 
##  160 1330 1141  288
dat %>% filter(!is.na(SalePrice)) %>% 
  ggplot(aes(x=neigh.group, y=SalePrice)) + geom_jitter()

OverallQual / OverallCond

dat %>% select(contains("Overall")) %>% summary
##   OverallQual      OverallCond   
##  Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 5.000   1st Qu.:5.000  
##  Median : 6.000   Median :5.000  
##  Mean   : 6.089   Mean   :5.565  
##  3rd Qu.: 7.000   3rd Qu.:6.000  
##  Max.   :10.000   Max.   :9.000
#OverallQual
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=as.factor(OverallQual), y=SalePrice, fill=as.factor(OverallQual))) + geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(OverallQual) %>% summarise(count = n(),
                                      mean = mean(SalePrice),
                                      median = median(SalePrice),
                                      sd = sd(SalePrice))
## # A tibble: 10 x 5
##    OverallQual count    mean median      sd
##          <dbl> <int>   <dbl>  <dbl>   <dbl>
##  1           1     2  50150   50150  15344.
##  2           2     3  51770.  60000  14254.
##  3           3    20  87474.  86250  24689.
##  4           4   116 108421. 108000  29022.
##  5           5   397 133523. 133000  27107.
##  6           6   374 161603. 160000  36090.
##  7           7   319 207716. 200141  44466.
##  8           8   168 274736. 269750  63899.
##  9           9    43 367513. 345000  81278.
## 10          10    18 438588. 432390 159785.
dat %>% filter(is.na(SalePrice)) %>%
  group_by(OverallQual) %>% tally()
## # A tibble: 10 x 2
##    OverallQual     n
##          <dbl> <int>
##  1           1     2
##  2           2    10
##  3           3    20
##  4           4   110
##  5           5   428
##  6           6   357
##  7           7   281
##  8           8   174
##  9           9    64
## 10          10    13
dat %>% filter(!is.na(SalePrice)) %>%
  group_by(OverallQual) %>% tally()
## # A tibble: 10 x 2
##    OverallQual     n
##          <dbl> <int>
##  1           1     2
##  2           2     3
##  3           3    20
##  4           4   116
##  5           5   397
##  6           6   374
##  7           7   319
##  8           8   168
##  9           9    43
## 10          10    18
#OverallCond

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=as.factor(OverallCond), y=SalePrice, fill=as.factor(OverallCond))) + geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(OverallCond) %>% summarise(count = n(),
                                      mean = mean(SalePrice),
                                      median = median(SalePrice),
                                      sd = sd(SalePrice))
## # A tibble: 9 x 5
##   OverallCond count    mean median      sd
##         <dbl> <int>   <dbl>  <dbl>   <dbl>
## 1           1     1  61000   61000     NA 
## 2           2     5 141986.  85000 141343.
## 3           3    25 101929.  89500  44852.
## 4           4    57 120438. 115000  38923.
## 5           5   821 203147. 185000  85117.
## 6           6   252 153962. 142750  60738.
## 7           7   205 158145. 145000  53468.
## 8           8    72 155652. 142500  52375.
## 9           9    22 216005. 176200  96486.
dat %>% filter(is.na(SalePrice)) %>%
  group_by(OverallCond) %>% tally()
## # A tibble: 9 x 2
##   OverallCond     n
##         <dbl> <int>
## 1           1     6
## 2           2     5
## 3           3    25
## 4           4    44
## 5           5   824
## 6           6   279
## 7           7   185
## 8           8    72
## 9           9    19
dat %>% filter(!is.na(SalePrice)) %>%
  group_by(OverallCond) %>% tally()
## # A tibble: 9 x 2
##   OverallCond     n
##         <dbl> <int>
## 1           1     1
## 2           2     5
## 3           3    25
## 4           4    57
## 5           5   821
## 6           6   252
## 7           7   205
## 8           8    72
## 9           9    22
dat$OverallCond %>% str
##  num [1:2919] 5 8 5 5 5 5 5 6 5 6 ...
train.test.graph("OverallCond", dat)

summary(dat$OverallCond)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.000   5.000   5.565   6.000   9.000

MasVnr

dat %>% select(contains("MasVnr")) %>% summary
##    MasVnrType     MasVnrArea    
##  BrkCmn :  26   Min.   :   0.0  
##  BrkFace: 879   1st Qu.:   0.0  
##  None   :1765   Median :   0.0  
##  Stone  : 249   Mean   : 101.4  
##                 3rd Qu.: 163.5  
##                 Max.   :1600.0
#MasVnrType
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=MasVnrType, y=SalePrice, fill=MasVnrType)) + geom_boxplot()

dat %>% filter(!is.na(SalePrice)) %>%
  group_by(MasVnrType) %>% summarise(count = n(),
                                     mean = mean(SalePrice),
                                     median = median(SalePrice),
                                     sd = sd(SalePrice))
## # A tibble: 4 x 5
##   MasVnrType count    mean median     sd
##   <fct>      <int>   <dbl>  <dbl>  <dbl>
## 1 BrkCmn        15 146318. 139000 46188.
## 2 BrkFace      445 204692. 181000 81214.
## 3 None         872 156958. 143125 61492.
## 4 Stone        128 265584. 246839 99940.
dat %>% filter(is.na(SalePrice)) %>%
  group_by(MasVnrType) %>% tally()
## # A tibble: 4 x 2
##   MasVnrType     n
##   <fct>      <int>
## 1 BrkCmn        11
## 2 BrkFace      434
## 3 None         893
## 4 Stone        121
dat %>% filter(!is.na(SalePrice)) %>%
  group_by(MasVnrType) %>% tally()
## # A tibble: 4 x 2
##   MasVnrType     n
##   <fct>      <int>
## 1 BrkCmn        15
## 2 BrkFace      445
## 3 None         872
## 4 Stone        128
train.test.graph("MasVnrType", dat)

table(dat$MasVnrType)
## 
##  BrkCmn BrkFace    None   Stone 
##      26     879    1765     249
dat$isMas <- as.factor(ifelse(dat$MasVnrType == "None", "no Mas", "Mas"))

table(dat$isMas)
## 
##    Mas no Mas 
##   1154   1765
dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=isMas, y=SalePrice, fill=isMas)) + geom_boxplot()

#MasVnrArea

dat %>% filter(!is.na(SalePrice)) %>%
  ggplot(aes(x=MasVnrArea, y=SalePrice)) + geom_jitter()

# Veneer area per veneer type; the type is mapped to fill explicitly, as in the
# other boxplots, instead of being passed as an unnamed aesthetic.
dat %>% ggplot(aes(x=MasVnrType, y=MasVnrArea, fill=MasVnrType)) + geom_boxplot()

dat %>% filter(MasVnrType == "None" & MasVnrArea != 0)
##   MSSubClass MSZoning LotFrontage LotArea Street    Alley LotShape
## 1         60        4    80.00000   10400   Pave no alley      Reg
## 2         20        4    70.00000   10150   Pave no alley      Reg
## 3         90        4    90.76607   18890   Pave no alley       IR
## 4         60        4    72.86033   10762   Pave no alley       IR
## 5        160        3    24.00000    2368   Pave no alley      Reg
## 6         20        4   102.00000   13514   Pave no alley       IR
## 7         20        3    52.00000    8626   Pave no alley      Reg
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl       NWAmes       Norm
## 2         Lvl    AllPub    Inside       Gtl        NAmes      Feedr
## 3         Lvl    AllPub    Inside       Gtl       Sawyer      Feedr
## 4         Lvl    AllPub   CulDSac       Gtl      Gilbert       Norm
## 5         Lvl    AllPub    Inside       Gtl       BrDale       Norm
## 6         Lvl    AllPub    Corner       Gtl      NridgHt       Norm
## 7         Lvl    AllPub    Inside       Gtl      OldTown       Norm
##   Condition2 OverallQual OverallCond RoofStyle RoofMatl Exterior1st
## 1       Norm           6           5     Gable  CompShg     VinylSd
## 2       Norm           5           5     Gable  CompShg     Wd Sdng
## 3       RRAe           5           5      Shed  CompShg     Plywood
## 4       Norm           7           5     Gable  CompShg     VinylSd
## 5       Norm           5           6     Gable  CompShg     HdBoard
## 6       Norm           9           5       Hip  CompShg     VinylSd
## 7       Norm           4           6     Gable  CompShg     MetalSd
##   Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd       None        288         3         3     CBlock
## 2     Wd Sdng       None          1         3         3     CBlock
## 3     Plywood       None          1         3         3     CBlock
## 4     VinylSd       None        344         4         3      PConc
## 5     HdBoard       None        312         3         3     CBlock
## 6     VinylSd       None        285         5         3      PConc
## 7     MetalSd       None          1         3         3     CBlock
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1        3        3           No          Rec        247          Unf
## 2        3        3           No          Rec        456          Unf
## 3        4        3           No          GLQ        498          Rec
## 4        4        3           No          GLQ        694          Unf
## 5        3        3           No          LwQ        765          Unf
## 6        5        3           No          GLQ       1142          Unf
## 7        0        0  no basement  no basement          0  no basement
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       485         732    GasA         4          Y      SBrkr
## 2          0       456         912    GasA         5          Y      FuseA
## 3        211       652        1361    GasA         5          Y      SBrkr
## 4          0       284         978    GasA         5          Y      SBrkr
## 5          0         0         765    GasA         3          Y      SBrkr
## 6          0       632        1774    GasA         5          Y      SBrkr
## 7          0         0           0    GasA         4          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 1      1012       778            0      1790            1            0
## 2       912         0            0       912            0            0
## 3      1361      1259            0      2620            0            0
## 4      1005       978            0      1983            0            0
## 5       765       600            0      1365            0            0
## 6      1808         0            0      1808            1            0
## 7       968         0            0       968            0            0
##   FullBath HalfBath BedroomAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1        2            4           3            8       Min2
## 2        1        0            2           3            5        Typ
## 3        2        2            4           3           12        Typ
## 4        2        1            3           4            9        Typ
## 5        1        1            3           3            7       Min1
## 6        2        0            3           5            7        Typ
## 7        1        0            2           3            5        Typ
##   Fireplaces FireplaceQu GarageType GarageFinish GarageCars GarageArea
## 1          1           3     Attchd            3          2        484
## 2          0           0     Attchd            3          1        275
## 3          1           3    BuiltIn            3          2        600
## 4          1           3     Attchd            4          2        490
## 5          0           0     Attchd            2          2        440
## 6          1           4     Attchd            4          3        850
## 7          0           0     Attchd            2          1        331
##   GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## 1          3          3          Y        148           0             0
## 2          3          3          Y          0           0             0
## 3          3          3          N        155          24           145
## 4          3          3          Y          0           0             0
## 5          3          3          Y          0          36             0
## 6          3          3          Y        200          26             0
## 7          2          3          Y          0           0             0
##   X3SsnPorch ScreenPorch PoolArea PoolQC    Fence MiscFeature MiscVal
## 1          0         147        0      0 no fence        None       0
## 2          0           0        0      0 no fence        None       0
## 3          0           0        0      0 no fence        Gar2    8300
## 4          0           0        0      0 no fence        None       0
## 5          0           0        0      0 no fence        None       0
## 6          0           0        0      0 no fence        None       0
## 7          0           0        0      0 no fence        None       0
##   MoSold YrSold SaleType SaleCondition SalePrice BldgType.new
## 1     11   2006       WD        Normal    165150    high bldg
## 2      7   2007      COD        Normal    114500    high bldg
## 3      8   2007       WD        Normal    190000     low bldg
## 4      5   2009       WD        Normal    225000    high bldg
## 5      5   2009       WD        Normal    125000    high bldg
## 6      3   2009       WD        Normal        NA    high bldg
## 7      5   2007       WD        Normal        NA    high bldg
##   HouseStyle.new   Exterior isHeating GarageAge GarageType.new
## 1     high house same Exter      GasA        34              2
## 2   medium house same Exter      GasA        49              2
## 3      low house same Exter      GasA        30              3
## 4     high house same Exter      GasA        10              2
## 5     high house same Exter      GasA        39              2
## 6   medium house same Exter      GasA         1              2
## 7   medium house same Exter      GasA        51              2
##   noMiscFeature Porch  isPorch Age YearBuilt.deca YearRemodAdd.deca Season
## 1          None   147    Porch  36           1970              1970   Fall
## 2          None     0 no Porch  51           1950              1950 Summer
## 3          Misc   169    Porch  32           1970              1970 Summer
## 4          None     0 no Porch  12           1990              1990 Spring
## 5          None    36    Porch  41           1970              1970 Spring
## 6          None    26    Porch   3           2000              2000 Spring
## 7          None     0 no Porch  53           1950              1950 Spring
##      BsmtBath              Bath Foundation.new  isKitchen isFireplaces
## 1    BsmtBath more than 2 baths            low  1 kitchen   Fireplaces
## 2 no BsmtBath            1 bath            low  1 kitchen no fireplace
## 3 no BsmtBath more than 2 baths            low 2 kitchens   Fireplaces
## 4 no BsmtBath more than 2 baths           high  1 kitchen   Fireplaces
## 5 no BsmtBath more than 2 baths            low  1 kitchen no fireplace
## 6    BsmtBath more than 2 baths           high  1 kitchen   Fireplaces
## 7 no BsmtBath            1 bath            low  1 kitchen no fireplace
##   MSZoning.new neigh.group  isMas
## 1      FV + RL           3 no Mas
## 2      FV + RL           2 no Mas
## 3      FV + RL           2 no Mas
## 4      FV + RL           3 no Mas
## 5       Others           1 no Mas
## 6      FV + RL           4 no Mas
## 7       Others           2 no Mas
#replace MasVnrArea in MasVnrType = "None" -> 0

# Rows recorded as having no veneer must not carry a veneer area: zero out the
# non-zero areas listed above. which() keeps only the rows where the condition
# is TRUE, so rows where it evaluates to NA are left untouched.
dat$MasVnrArea <- replace(dat$MasVnrArea,
                          which(dat$MasVnrType == "None" & dat$MasVnrArea != 0),
                          0)


#Again
dat %>% filter(!is.na(SalePrice) & MasVnrType != "None") %>%
  ggplot(aes(x=MasVnrArea, y=SalePrice)) + geom_jitter()

dat %>% filter(MasVnrType != "None") %>%
  ggplot(aes(x=MasVnrArea)) + geom_density(fill="blue")

#Skewed distribution

train.test.graph("MasVnrArea", dat)

dat %>% filter(MasVnrArea > 1200) %>% select(SalePrice, contains("Mas"))
##   SalePrice MasVnrType MasVnrArea isMas
## 1    239000    BrkFace       1600   Mas
## 2    625000    BrkFace       1378   Mas
## 3        NA    BrkFace       1290   Mas
## 4        NA      Stone       1224   Mas
## 5        NA    BrkFace       1224   Mas
#It might have to be replaced 
dat %>% filter(MasVnrType == "BrkFace") %>% select(SalePrice, MasVnrArea, MasVnrType) %>% summary
##    SalePrice        MasVnrArea       MasVnrType 
##  Min.   : 75000   Min.   :   0.0   BrkCmn :  0  
##  1st Qu.:149300   1st Qu.: 120.0   BrkFace:879  
##  Median :181000   Median : 203.0   None   :  0  
##  Mean   :204692   Mean   : 261.7   Stone  :  0  
##  3rd Qu.:236000   3rd Qu.: 340.0                
##  Max.   :755000   Max.   :1600.0                
##  NA's   :434

Numerical predictors (skewed) -> log / sqrt / normalization(scale) Transformation for Numerical variables —————

num.vars<- dat %>% select_if(is.numeric) %>% colnames
num.vars
##  [1] "MSZoning"       "LotFrontage"    "LotArea"        "OverallQual"   
##  [5] "OverallCond"    "MasVnrArea"     "ExterQual"      "ExterCond"     
##  [9] "BsmtQual"       "BsmtCond"       "BsmtFinSF1"     "BsmtFinSF2"    
## [13] "BsmtUnfSF"      "TotalBsmtSF"    "HeatingQC"      "X1stFlrSF"     
## [17] "X2ndFlrSF"      "LowQualFinSF"   "GrLivArea"      "BsmtFullBath"  
## [21] "BsmtHalfBath"   "FullBath"       "HalfBath"       "BedroomAbvGr"  
## [25] "KitchenQual"    "TotRmsAbvGrd"   "Fireplaces"     "FireplaceQu"   
## [29] "GarageFinish"   "GarageCars"     "GarageArea"     "GarageQual"    
## [33] "GarageCond"     "WoodDeckSF"     "OpenPorchSF"    "EnclosedPorch" 
## [37] "X3SsnPorch"     "ScreenPorch"    "PoolArea"       "PoolQC"        
## [41] "MiscVal"        "SalePrice"      "GarageAge"      "GarageType.new"
## [45] "Porch"          "Age"
#SalePrice
dat %>% 
  filter(!is.na(SalePrice)) %>% 
  ggplot(aes(sample = SalePrice)) + 
  stat_qq() + 
  stat_qq_line()+
  ggtitle("No Transformation")+
  theme(plot.title = element_text(hjust = 0.5))

dat %>% 
  filter(!is.na(SalePrice)) %>% mutate(SalePrice = log(SalePrice + 1)) %>%
  ggplot(aes(sample = SalePrice)) + 
  stat_qq() + 
  stat_qq_line()+
  ggtitle("Log Transformation")+
  theme(plot.title = element_text(hjust = 0.5))

# Q-Q plot of SalePrice after z-score standardization (centered and scaled).
# scale = TRUE is required for a z-score; scale = FALSE would only center.
# as.numeric() drops the one-column matrix that scale() returns so the
# mutated column stays a plain numeric vector.
dat %>% 
  filter(!is.na(SalePrice)) %>%
  mutate(SalePrice = as.numeric(scale(SalePrice, center = TRUE, scale = TRUE))) %>%
  ggplot(aes(sample = SalePrice)) + 
  stat_qq() + 
  stat_qq_line() +
  ggtitle("zscale Normalization") +
  theme(plot.title = element_text(hjust = 0.5))

dat %>% 
  filter(!is.na(SalePrice)) %>% mutate(SalePrice = sqrt(SalePrice)) %>%
  ggplot(aes(sample = SalePrice)) + 
  stat_qq() + 
  stat_qq_line()+
  ggtitle("Square Root Transformation")+
  theme(plot.title = element_text(hjust = 0.5))

#find best transformation method with skewness and kurtosis
#skewness and kurtosis cutoffs
#skewness -> +/- 1
#kurtosis -> +/- 3
#the closer both are to zero, the closer the distribution is to normal


skewness(dat$SalePrice[!is.na(dat$SalePrice)])
## [1] 1.879009
dat %>% 
  filter(!is.na(SalePrice)) %>% mutate(SalePrice = sqrt(SalePrice)) %>%
  summarise(skew = skewness(SalePrice), kurt = kurtosis(SalePrice))
##        skew     kurt
## 1 0.9412156 1.940224
# Log transformation: natural log of (var + 1), so an input of 0 maps to 0
log.func <- function(var) {
  log(var + 1)
}

# Z-score transformation: center on the mean and divide by the standard
# deviation. The value is returned exactly as scale() produces it
# (a one-column matrix when var is a plain vector).
scale.func <- function(var) {
  scale(var, center = TRUE, scale = TRUE)
}

# Min-Max transformation: rescale var linearly so that its minimum maps to 0
# and its maximum maps to 1
minmax.func <- function(var) {
  lo <- min(var)
  hi <- max(var)
  (var - lo) / (hi - lo)
}


# Compare the skewness and kurtosis of one column under several candidate
# transformations.
#
# var:  name of the column to test (character)
# data: data frame holding that column
#
# Returns a data frame with one row per transformation and the columns
# skew, kurt and transformation. Missing values in the column are dropped
# before anything is computed.
skew.kurt.testFunc <- function(var, data){

  observed <- data[!is.na(data[, var]), var]

  # Transformed copies of the column, in the order they are reported
  candidates <- list(original = observed,
                     log      = log.func(observed),
                     zscore   = scale.func(observed),
                     minmax   = minmax.func(observed),
                     sqrt     = sqrt(observed))

  # One single-row data frame (skew, kurt) per candidate
  per.transform <- lapply(candidates, function(values){
    data.frame(skew = skewness(values),
               kurt = kurtosis(values))
  })

  # unname() keeps the default 1..5 row names when the rows are stacked
  skew.kurt.final <- do.call(rbind, unname(per.transform))
  skew.kurt.final$transformation <- names(candidates)

  skew.kurt.final
}


skew.kurt.testFunc("SalePrice", dat)
##        skew      kurt transformation
## 1 1.8790086 6.4967893       original
## 2 0.1210974 0.7974354            log
## 3 1.8790086 6.4967893         zscore
## 4 1.8790086 6.4967893         minmax
## 5 0.9412156 1.9402243           sqrt
#But, Let's try boxcox transformation for numerical variables


bc <- BoxCoxTrans(dat$SalePrice, na.rm=TRUE)
bc
## Box-Cox Transformation
## 
## 1460 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000 
## 
## Largest/Smallest: 21.6 
## Sample Skewness: 1.88 
## 
## Estimated Lambda: -0.1 
## With fudge factor, Lambda = 0 will be used for transformations
predict(bc,dat$SalePrice) %>% head
## [1] 12.24769 12.10901 12.31717 11.84940 12.42922 11.87060
# Fit a Box-Cox transformation for a single column.
#
# vars: name of the column to fit
# data: data frame holding that column
#
# na.rm = TRUE is passed to BoxCoxTrans(). Returns the fitted BoxCoxTrans
# object (its $lambda element holds the estimated lambda).
bc.func <- function(vars, data) {
  BoxCoxTrans(data[, vars], na.rm = TRUE)
}

bc.func("SalePrice", dat)
## Box-Cox Transformation
## 
## 1460 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000 
## 
## Largest/Smallest: 21.6 
## Sample Skewness: 1.88 
## 
## Estimated Lambda: -0.1 
## With fudge factor, Lambda = 0 will be used for transformations
bc.func("SalePrice", dat)$lambda
## [1] -0.1
#returns the estimated lambda
#in this case, SalePrice will be log transformed: the estimated lambda (-0.1) is within the fudge factor of 0, so lambda = 0 (log) is used





#creating new data to transform numerical variables
trans.dat <- dat

trans.dat %>% select_if(is.numeric) %>% colnames
##  [1] "MSZoning"       "LotFrontage"    "LotArea"        "OverallQual"   
##  [5] "OverallCond"    "MasVnrArea"     "ExterQual"      "ExterCond"     
##  [9] "BsmtQual"       "BsmtCond"       "BsmtFinSF1"     "BsmtFinSF2"    
## [13] "BsmtUnfSF"      "TotalBsmtSF"    "HeatingQC"      "X1stFlrSF"     
## [17] "X2ndFlrSF"      "LowQualFinSF"   "GrLivArea"      "BsmtFullBath"  
## [21] "BsmtHalfBath"   "FullBath"       "HalfBath"       "BedroomAbvGr"  
## [25] "KitchenQual"    "TotRmsAbvGrd"   "Fireplaces"     "FireplaceQu"   
## [29] "GarageFinish"   "GarageCars"     "GarageArea"     "GarageQual"    
## [33] "GarageCond"     "WoodDeckSF"     "OpenPorchSF"    "EnclosedPorch" 
## [37] "X3SsnPorch"     "ScreenPorch"    "PoolArea"       "PoolQC"        
## [41] "MiscVal"        "SalePrice"      "GarageAge"      "GarageType.new"
## [45] "Porch"          "Age"
#adding 1 before the Box-Cox transformation (it includes the log transformation, which is undefined at 0)
trans.dat <- trans.dat %>% mutate_if(is.numeric, function(x){x+1})

num.vars<- trans.dat %>% select_if(is.numeric) %>% colnames
num.vars
##  [1] "MSZoning"       "LotFrontage"    "LotArea"        "OverallQual"   
##  [5] "OverallCond"    "MasVnrArea"     "ExterQual"      "ExterCond"     
##  [9] "BsmtQual"       "BsmtCond"       "BsmtFinSF1"     "BsmtFinSF2"    
## [13] "BsmtUnfSF"      "TotalBsmtSF"    "HeatingQC"      "X1stFlrSF"     
## [17] "X2ndFlrSF"      "LowQualFinSF"   "GrLivArea"      "BsmtFullBath"  
## [21] "BsmtHalfBath"   "FullBath"       "HalfBath"       "BedroomAbvGr"  
## [25] "KitchenQual"    "TotRmsAbvGrd"   "Fireplaces"     "FireplaceQu"   
## [29] "GarageFinish"   "GarageCars"     "GarageArea"     "GarageQual"    
## [33] "GarageCond"     "WoodDeckSF"     "OpenPorchSF"    "EnclosedPorch" 
## [37] "X3SsnPorch"     "ScreenPorch"    "PoolArea"       "PoolQC"        
## [41] "MiscVal"        "SalePrice"      "GarageAge"      "GarageType.new"
## [45] "Porch"          "Age"
trans.dat1 <- trans.dat
#Box-Cox transformation applied
#some variables cannot be Box-Cox transformed because no lambda could be estimated
#those variables are transformed by log or normal transformation
for(i in num.vars){
  #fit once and reuse the fit for both the lambda check and the prediction
  bc.fit <- bc.func(i, trans.dat)
  lambda <- bc.fit$lambda
  if(is.na(lambda)){
    #no lambda could be estimated: report the variable name
    print(i)
  } else {
    trans.dat[,i] <- predict(bc.fit, trans.dat[,i])
    #compare the whole column against the untouched copy in trans.dat1;
    #if() needs a single TRUE/FALSE, and identical() also copes with NAs
    if(identical(trans.dat[,i], trans.dat1[,i])){
      print(paste0(i, " has not been trasnformed"))
    }
  }
}
## [1] "TotalBsmtSF has not been trasnformed"
## [1] "BedroomAbvGr has not been trasnformed"
## [1] "GarageArea has not been trasnformed"
skew.kurt.testFunc("TotalBsmtSF", trans.dat) #sqrt transformation will be the best
##        skew      kurt transformation
## 1  1.156300  9.096861       original
## 2 -4.782990 24.234139            log
## 3  1.156300  9.096861         zscore
## 4  1.156300  9.096861         minmax
## 5 -1.218461  5.189522           sqrt
skew.kurt.testFunc("BedroomAbvGr", trans.dat) #sqrt
##         skew     kurt transformation
## 1  0.3261567 1.932644       original
## 2 -0.6190533 2.212824            log
## 3  0.3261567 1.932644         zscore
## 4  0.3261567 1.932644         minmax
## 5 -0.2805640 1.787379           sqrt
skew.kurt.testFunc("GarageArea", trans.dat) #zscore
##         skew       kurt transformation
## 1  0.2414159  0.9345737       original
## 2 -3.4181129 11.0191213            log
## 3  0.2414159  0.9345737         zscore
## 4  0.2414159  0.9345737         minmax
## 5 -1.4383471  3.3023448           sqrt
#TotalBsmtSF
#BedroomAbvGr
#GarageArea

#TotalBsmtSF and BedroomAbvGr: square root; GarageArea: z-score
#scale.func() returns a one-column matrix, so as.numeric() is applied to keep
#GarageArea a plain numeric column instead of a matrix column
trans.dat <- trans.dat %>% mutate(TotalBsmtSF = sqrt(TotalBsmtSF),
                                  BedroomAbvGr = sqrt(BedroomAbvGr),
                                  GarageArea = as.numeric(scale.func(GarageArea)))




trans.dat %>% select_if(is.numeric) %>% summary
##     MSZoning      LotFrontage        LotArea        OverallQual    
##  Min.   : 1.50   Min.   : 8.982   Min.   : 7.171   Min.   :0.8921  
##  1st Qu.:12.00   1st Qu.:17.969   1st Qu.: 8.920   1st Qu.:3.5788  
##  Median :12.00   Median :19.842   Median : 9.154   Median :4.1493  
##  Mean   :11.39   Mean   :19.599   Mean   : 9.095   Mean   :4.1752  
##  3rd Qu.:12.00   3rd Qu.:21.611   3rd Qu.: 9.356   3rd Qu.:4.6958  
##  Max.   :17.50   Max.   :50.815   Max.   :12.280   Max.   :6.2252  
##                                                                    
##   OverallCond       MasVnrArea      ExterQual        ExterCond     
##  Min.   :0.8284   Min.   :0.000   Min.   :0.5848   Min.   :0.6931  
##  1st Qu.:2.8990   1st Qu.:0.000   1st Qu.:0.6424   1st Qu.:1.3863  
##  Median :2.8990   Median :0.000   Median :0.6424   Median :1.3863  
##  Mean   :3.1059   Mean   :1.033   Mean   :0.6543   Mean   :1.4035  
##  3rd Qu.:3.2915   3rd Qu.:2.610   3rd Qu.:0.6743   3rd Qu.:1.3863  
##  Max.   :4.3246   Max.   :2.969   Max.   :0.6943   Max.   :1.7918  
##                                                                    
##     BsmtQual         BsmtCond        BsmtFinSF1       BsmtFinSF2    
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.: 7.500   1st Qu.: 7.500   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median :12.000   Median : 7.500   Median :11.307   Median :0.0000  
##  Mean   : 9.938   Mean   : 7.352   Mean   : 8.539   Mean   :0.0792  
##  3rd Qu.:12.000   3rd Qu.: 7.500   3rd Qu.:13.712   3rd Qu.:0.0000  
##  Max.   :17.500   Max.   :12.000   Max.   :23.139   Max.   :0.6667  
##                                                                     
##    BsmtUnfSF      TotalBsmtSF      HeatingQC       X1stFlrSF    
##  Min.   : 0.00   Min.   : 1.00   Min.   : 4.00   Min.   :5.814  
##  1st Qu.:27.73   1st Qu.:28.18   1st Qu.: 7.50   1st Qu.:6.777  
##  Median :41.27   Median :31.46   Median :17.50   Median :6.987  
##  Mean   :40.74   Mean   :31.49   Mean   :13.23   Mean   :7.004  
##  3rd Qu.:54.78   3rd Qu.:36.10   3rd Qu.:17.50   3rd Qu.:7.236  
##  Max.   :94.69   Max.   :78.17   Max.   :17.50   Max.   :8.536  
##                                                                 
##    X2ndFlrSF      LowQualFinSF        GrLivArea      BsmtFullBath   
##  Min.   :0.000   Min.   :0.000000   Min.   :5.814   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.000000   1st Qu.:7.027   1st Qu.:0.0000  
##  Median :0.000   Median :0.000000   Median :7.276   Median :0.0000  
##  Mean   :2.830   Mean   :0.006851   Mean   :7.262   Mean   :0.1804  
##  3rd Qu.:6.558   3rd Qu.:0.000000   3rd Qu.:7.464   3rd Qu.:0.4310  
##  Max.   :7.633   Max.   :0.500000   Max.   :8.638   Max.   :0.5833  
##                                                                     
##   BsmtHalfBath        FullBath         HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:0.00000   1st Qu.:0.8284   1st Qu.:0.0000   1st Qu.:1.732  
##  Median :0.00000   Median :1.4641   Median :0.0000   Median :2.000  
##  Mean   :0.02258   Mean   :1.1862   Mean   :0.1400   Mean   :1.953  
##  3rd Qu.:0.00000   3rd Qu.:1.4641   3rd Qu.:0.3750   3rd Qu.:2.000  
##  Max.   :0.44444   Max.   :2.4721   Max.   :0.4444   Max.   :3.000  
##                                                                     
##   KitchenQual     TotRmsAbvGrd     Fireplaces      FireplaceQu    
##  Min.   :1.099   Min.   :1.099   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.386   1st Qu.:1.792   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.386   Median :1.946   Median :0.5858   Median :0.6931  
##  Mean   :1.496   Mean   :1.987   Mean   :0.3220   Mean   :0.7565  
##  3rd Qu.:1.609   3rd Qu.:2.079   3rd Qu.:0.5858   3rd Qu.:1.6094  
##  Max.   :1.792   Max.   :2.773   Max.   :1.1056   Max.   :1.7918  
##                                                                   
##   GarageFinish      GarageCars       GarageArea.V1      GarageQual    
##  Min.   :0.8595   Min.   :0.000   Min.   :-2.195601   Min.   : 0.000  
##  1st Qu.:1.5553   1st Qu.:1.081   1st Qu.:-0.709732   1st Qu.: 7.500  
##  Median :2.1623   Median :2.281   Median : 0.033203   Median : 7.500  
##  Mean   :1.9712   Mean   :2.020   Mean   : 0.000000   Mean   : 6.985  
##  3rd Qu.:2.1623   3rd Qu.:2.281   3rd Qu.: 0.478963   3rd Qu.: 7.500  
##  Max.   :2.7109   Max.   :6.322   Max.   : 4.713690   Max.   :17.500  
##                                                                       
##    GarageCond       WoodDeckSF     OpenPorchSF    EnclosedPorch   
##  Min.   : 0.000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.: 7.500   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median : 7.500   Median :0.000   Median :3.296   Median :0.0000  
##  Mean   : 7.013   Mean   :2.450   Mean   :2.333   Mean   :0.1206  
##  3rd Qu.: 7.500   3rd Qu.:5.130   3rd Qu.:4.263   3rd Qu.:0.0000  
##  Max.   :17.500   Max.   :7.262   Max.   :6.611   Max.   :0.7691  
##                                                                   
##    X3SsnPorch        ScreenPorch         PoolArea       
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.00000   Median :0.000000  
##  Mean   :0.006337   Mean   :0.04385   Mean   :0.002227  
##  3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :0.499998   Max.   :0.50000   Max.   :0.499999  
##                                                         
##      PoolQC            MiscVal          SalePrice       GarageAge     
##  Min.   :0.000000   Min.   :0.00000   Min.   :10.46   Min.   : 0.000  
##  1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:11.78   1st Qu.: 2.887  
##  Median :0.000000   Median :0.00000   Median :12.00   Median : 6.005  
##  Mean   :0.001628   Mean   :0.01764   Mean   :12.02   Mean   : 5.333  
##  3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:12.27   3rd Qu.: 7.573  
##  Max.   :0.486111   Max.   :0.50000   Max.   :13.53   Max.   :10.505  
##                                       NA's   :1459                    
##  GarageType.new      Porch             Age       
##  Min.   :0.000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:1.323   1st Qu.: 0.000   1st Qu.:2.379  
##  Median :3.219   Median : 5.977   Median :3.913  
##  Mean   :2.683   Mean   : 5.294   Mean   :4.020  
##  3rd Qu.:3.219   3rd Qu.: 8.376   3rd Qu.:5.753  
##  Max.   :5.621   Max.   :15.672   Max.   :6.451  
## 
#quick lm

lm1 <- lm(SalePrice ~.,
          data = trans.dat %>% filter(!is.na(SalePrice)))


par(mfrow=c(2,2))
plot(lm1)

par(mfrow=c(1,1))
#observations with leverage one are not plotted (e.g. the only observation of a factor level), so they cannot be compared with the other observations

#index 524 and 1299 must be outliers

#There are several ways to see outliers in our model
outlierTest(lm1,n.max=50,cutoff=0.05)
##        rstudent unadjusted p-value Bonferonni p
## 1299 -13.546075         4.8414e-39   7.0200e-36
## 633   -6.780697         1.8607e-11   2.6980e-08
## 31    -6.372993         2.6247e-10   3.8058e-07
## 524   -6.244538         5.8620e-10   8.4999e-07
## 826    6.244538         5.8620e-10   8.4999e-07
## 463   -5.587255         2.8440e-08   4.1239e-05
## 969   -5.202013         2.3113e-07   3.3514e-04
## 811    5.133367         3.3105e-07   4.8002e-04
## 496   -5.084702         4.2598e-07   6.1767e-04
## 1325  -5.048986         5.1186e-07   7.4220e-04
## 1183   4.948320         8.5364e-07   1.2378e-03
## 589   -4.240207         2.4021e-05   3.4831e-02
influencePlot(lm1,id.method="identify",main="influential plot",sub="circle size is proportial to cook's distance")

##         StudRes       Hat      CookD
## 347         NaN 1.0000000        NaN
## 379         NaN 1.0000000        NaN
## 633   -6.780697 0.1160308 0.02405032
## 811    5.133367 0.9773506 4.60295959
## 1299 -13.546075 0.4262180 0.48984845
#However,
#I'm using Standardized Residuals to detect outliers

#from Springers Text book "Linear Regression"

#"In summary, an outlier is a point whose standardized residual falls outside the interval from -2 to 2. 
#Recall that a bad leverage point is a leverage point which is also an outlier. Thus, a bad leverage point is a leverage point whose standardized residual falls outside the interval from -2 to 2."

#However, I used -4 and 4 because of the size of dataset
#Standardized residuals of lm1, computed once and reused for both tails
std.resid <- rstandard(lm1, infl = lm.influence(lm1, do.coef = FALSE),
                       sd = sqrt(deviance(lm1)/df.residual(lm1)),
                       type = c("sd.1","predictive"))

#positions whose standardized residual is above 4 / below -4
o1 <- which(std.resid > 4)
o2 <- which(std.resid < (-4))


outliers <- c(o1, o2)
length(outliers)
## [1] 14
outliers
##  682  811  826 1183   31  463  496  524  589  633  969 1299 1325 1454 
##  682  811  826 1183   31  463  496  524  589  633  969 1299 1325 1454
trans.dat <- trans.dat[-outliers,]

#outlier again
#quick lm

lm1 <- lm(SalePrice ~.,
          data = trans.dat %>% filter(!is.na(SalePrice)))


#Standardized residuals of the refitted lm1, computed once for both tails
std.resid <- rstandard(lm1, infl = lm.influence(lm1, do.coef = FALSE),
                       sd = sqrt(deviance(lm1)/df.residual(lm1)),
                       type = c("sd.1","predictive"))

#positions whose standardized residual is above 4 / below -4
o1 <- which(std.resid > 4)
o2 <- which(std.resid < (-4))


outliers <- c(o1, o2)
length(outliers)
## [1] 5
outliers
##  185  961  410  805 1420 
##  185  961  410  805 1420
trans.dat <- trans.dat[-outliers,]









trans.dat %>% select_if(is.numeric) %>% colnames
##  [1] "MSZoning"       "LotFrontage"    "LotArea"        "OverallQual"   
##  [5] "OverallCond"    "MasVnrArea"     "ExterQual"      "ExterCond"     
##  [9] "BsmtQual"       "BsmtCond"       "BsmtFinSF1"     "BsmtFinSF2"    
## [13] "BsmtUnfSF"      "TotalBsmtSF"    "HeatingQC"      "X1stFlrSF"     
## [17] "X2ndFlrSF"      "LowQualFinSF"   "GrLivArea"      "BsmtFullBath"  
## [21] "BsmtHalfBath"   "FullBath"       "HalfBath"       "BedroomAbvGr"  
## [25] "KitchenQual"    "TotRmsAbvGrd"   "Fireplaces"     "FireplaceQu"   
## [29] "GarageFinish"   "GarageCars"     "GarageArea"     "GarageQual"    
## [33] "GarageCond"     "WoodDeckSF"     "OpenPorchSF"    "EnclosedPorch" 
## [37] "X3SsnPorch"     "ScreenPorch"    "PoolArea"       "PoolQC"        
## [41] "MiscVal"        "SalePrice"      "GarageAge"      "GarageType.new"
## [45] "Porch"          "Age"
#After seeing some predictors if they have outliers in test set..
#totalbsmtSF

train.test.graph("LotFrontage", trans.dat)

trans.dat %>% filter(!is.na(SalePrice) & LotFrontage > 40) %>% dim
## [1]  1 95
trans.dat %>% filter(is.na(SalePrice) & LotFrontage > 40) %>% dim
## [1]  0 95
trans.dat$LotFrontage[trans.dat$LotFrontage > 40] <- 
  max(trans.dat$LotFrontage[is.na(trans.dat$SalePrice)])

train.test.graph("LotFrontage", trans.dat)

train.test.graph("LotArea", trans.dat)

#replace max value in training as max value in test set
trans.dat$LotArea[!is.na(trans.dat$SalePrice) & trans.dat$LotArea >max(trans.dat$LotArea[is.na(trans.dat$SalePrice)]) ] <- max(trans.dat$LotArea[is.na(trans.dat$SalePrice)])


train.test.graph("TotalBsmtSF", trans.dat)

trans.dat$TotalBsmtSF[is.na(trans.dat$SalePrice) & trans.dat$TotalBsmtSF>60]
## [1] 71.38627
#replace max value in test set as max value in training set
trans.dat$TotalBsmtSF[is.na(trans.dat$SalePrice) & trans.dat$TotalBsmtSF>60] <- max(trans.dat$TotalBsmtSF[!is.na(trans.dat$SalePrice)])


train.test.graph("TotalBsmtSF", trans.dat)

Factor variables

dat.prc <- trans.dat
dat.prc %>% str
## 'data.frame':    2900 obs. of  95 variables:
##  $ MSSubClass       : Factor w/ 16 levels "20","30","40",..: 6 1 6 7 6 5 1 6 5 16 ...
##  $ MSZoning         : num  12 12 12 12 12 12 12 12 7.5 12 ...
##  $ LotFrontage      : num  18.9 21.6 19.5 18 22.3 ...
##  $ LotArea          : num  9.04 9.17 9.33 9.16 9.57 ...
##  $ Street           : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Alley            : Factor w/ 3 levels "Gravel","no alley",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ LotShape         : chr  "Reg" "Reg" "IR" "IR" ...
##  $ LandContour      : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Utilities        : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ LotConfig        : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
##  $ LandSlope        : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood     : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ Condition1       : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
##  $ Condition2       : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ OverallQual      : num  4.7 4.15 4.7 4.7 5.22 ...
##  $ OverallCond      : num  2.9 4 2.9 2.9 2.9 ...
##  $ RoofStyle        : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ RoofMatl         : Factor w/ 4 levels "CompShg","Tar&Grv",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Exterior1st      : Factor w/ 8 levels "BrkFace","CemntBd",..: 6 4 6 7 6 6 6 3 1 4 ...
##  $ Exterior2nd      : Factor w/ 6 levels "CmentBd","HdBoard",..: 5 3 5 3 5 5 5 2 3 3 ...
##  $ MasVnrType       : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 3 2 3 2 3 4 4 3 3 ...
##  $ MasVnrArea       : num  2.65 0 2.61 0 2.76 ...
##  $ ExterQual        : num  0.674 0.642 0.674 0.642 0.674 ...
##  $ ExterCond        : num  1.39 1.39 1.39 1.39 1.39 ...
##  $ Foundation       : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
##  $ BsmtQual         : num  12 12 12 7.5 12 12 17.5 12 7.5 7.5 ...
##  $ BsmtCond         : num  7.5 7.5 7.5 12 7.5 7.5 7.5 7.5 7.5 7.5 ...
##  $ BsmtExposure     : Factor w/ 5 levels "Av","Gd","Mn",..: 4 2 3 4 1 4 1 3 4 4 ...
##  $ BsmtFinType1     : Factor w/ 7 levels "ALQ","BLQ","GLQ",..: 3 1 3 1 3 3 3 1 7 3 ...
##  $ BsmtFinSF1       : num  13.57 14.82 12.24 9.66 13.3 ...
##  $ BsmtFinType2     : Factor w/ 7 levels "ALQ","BLQ","GLQ",..: 7 7 7 7 7 7 7 2 7 7 ...
##  $ BsmtFinSF2       : num  0 0 0 0 0 ...
##  $ BsmtUnfSF        : num  22.6 31.8 39.7 44.5 42.3 ...
##  $ TotalBsmtSF      : num  29.3 35.5 30.3 27.5 33.9 ...
##  $ Heating          : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC        : num  17.5 17.5 17.5 12 17.5 17.5 17.5 17.5 12 17.5 ...
##  $ CentralAir       : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical       : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ X1stFlrSF        : num  6.75 7.14 6.83 6.87 7.04 ...
##  $ X2ndFlrSF        : num  6.75 0 6.77 6.63 6.96 ...
##  $ LowQualFinSF     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea        : num  7.44 7.14 7.49 7.45 7.7 ...
##  $ BsmtFullBath     : num  0.431 0 0.431 0.431 0.431 ...
##  $ BsmtHalfBath     : num  0 0.375 0 0 0 0 0 0 0 0 ...
##  $ FullBath         : num  1.464 1.464 1.464 0.828 1.464 ...
##  $ HalfBath         : num  0.375 0 0.375 0 0.375 0.375 0 0.375 0 0 ...
##  $ BedroomAbvGr     : num  2 2 2 2 2.24 ...
##  $ KitchenQual      : num  1.61 1.39 1.61 1.61 1.61 ...
##  $ TotRmsAbvGrd     : num  2.2 1.95 1.95 2.08 2.3 ...
##  $ Functional       : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces       : num  0 0.586 0.586 0.586 0.586 ...
##  $ FireplaceQu      : num  0 1.39 1.39 1.61 1.39 ...
##  $ GarageType       : Factor w/ 7 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
##  $ GarageFinish     : num  2.16 2.16 2.16 1.56 2.16 ...
##  $ GarageCars       : num  2.28 2.28 2.28 3.57 3.57 ...
##  $ GarageArea       : num [1:2900, 1] 0.3489 -0.0597 0.6276 0.7854 1.6862 ...
##  $ GarageQual       : num  7.5 7.5 7.5 7.5 7.5 7.5 7.5 7.5 4 12 ...
##  $ GarageCond       : num  7.5 7.5 7.5 7.5 7.5 7.5 7.5 7.5 7.5 7.5 ...
##  $ PavedDrive       : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
##  $ WoodDeckSF       : num  0 5.7 0 0 5.26 ...
##  $ OpenPorchSF      : num  4.13 0 3.76 3.58 4.44 ...
##  $ EnclosedPorch    : num  0 0 0 0.769 0 ...
##  $ X3SsnPorch       : num  0 0 0 0 0 ...
##  $ ScreenPorch      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fence            : Factor w/ 5 levels "GdPrv","GdWo",..: 5 5 5 5 5 3 5 5 5 5 ...
##  $ MiscFeature      : Factor w/ 5 levels "Gar2","None",..: 2 2 2 2 2 4 2 4 2 2 ...
##  $ MiscVal          : num  0 0 0 0 0 ...
##  $ MoSold           : Factor w/ 12 levels "1","2","3","4",..: 2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold           : Factor w/ 5 levels "2006","2007",..: 3 2 3 1 3 4 2 4 3 3 ...
##  $ SaleType         : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition    : Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice        : num  12.2 12.1 12.3 11.8 12.4 ...
##  $ BldgType.new     : Factor w/ 2 levels "high bldg","low bldg": 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle.new   : Factor w/ 3 levels "high house","low house",..: 1 3 1 1 1 2 3 1 2 2 ...
##  $ Exterior         : Factor w/ 2 levels "more option Exter",..: 2 2 2 1 2 2 2 2 1 2 ...
##  $ isHeating        : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ GarageAge        : num  2.37 6.09 2.89 3.11 3.11 ...
##  $ GarageType.new   : num  3.22 3.22 3.22 1.32 3.22 ...
##  $ noMiscFeature    : chr  "None" "None" "None" "None" ...
##  $ Porch            : num  6.41 0 5.61 10.73 7.16 ...
##  $ isPorch          : chr  "Porch" "no Porch" "Porch" "Porch" ...
##  $ Age              : num  2.58 5.12 2.76 5.4 3.08 ...
##  $ YearBuilt.deca   : Factor w/ 15 levels "1870","1880",..: 14 11 14 5 14 13 14 11 7 7 ...
##  $ YearRemodAdd.deca: Factor w/ 7 levels "1950","1960",..: 6 3 6 3 6 5 6 3 1 1 ...
##  $ Season           : chr  "Winter" "Spring" "Fall" "Winter" ...
##  $ BsmtBath         : chr  "BsmtBath" "BsmtBath" "BsmtBath" "BsmtBath" ...
##  $ Bath             : chr  "more than 2 baths" "more than 2 baths" "more than 2 baths" "1 bath" ...
##  $ Foundation.new   : chr  "high" "low" "high" "low" ...
##  $ isKitchen        : chr  "1 kitchen" "1 kitchen" "1 kitchen" "1 kitchen" ...
##  $ isFireplaces     : Factor w/ 2 levels "Fireplaces","no fireplace": 2 1 1 1 1 2 1 1 1 1 ...
##  $ MSZoning.new     : Factor w/ 2 levels "FV + RL","Others": 1 1 1 1 1 1 1 1 2 1 ...
##  $ neigh.group      : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 4 2 3 3 2 2 ...
##  $ isMas            : Factor w/ 2 levels "Mas","no Mas": 1 2 1 2 1 2 1 1 2 2 ...
#Factor Variables mostly skewed

fac.vars <- trans.dat %>% select_if(is.factor) %>% colnames

#Collect the factor predictors in which a single level covers more than 95% of the rows
not.imp.fac <- NULL
for(i in fac.vars){
  #share of rows falling into each level of this factor
  level.share <- prop.table(matrix(table(trans.dat[,i])))
  dominated <- any(level.share > 0.95)
  if(dominated){
    not.imp.fac <- c(not.imp.fac, i)
  }
}


#Let's remove insignificant or independent factor predictors with response
not.imp.fac
## [1] "Street"      "Utilities"   "LandSlope"   "Condition2"  "RoofMatl"   
## [6] "MiscFeature"
dat.prc <- trans.dat[,!colnames(trans.dat) %in% not.imp.fac]



trans.dat <- dat.prc

trans.dat %>% dim
## [1] 2900   89

Categorical predictors -> dummy variables , Dummy - one hot encoding

#Splitting numerical variable and factor variables to make dummy variables
num.trans.dat <- trans.dat %>% select_if(is.numeric)
fac.trans.dat <- trans.dat %>% select_if(is.factor)


fac.var <- fac.trans.dat %>% colnames

#creating formula
formula.dummy <- as.formula(paste("~", paste(fac.var, collapse = "+")))

dummies <- dummyVars(formula.dummy, data=fac.trans.dat)

dummies.pred <- predict(dummies, fac.trans.dat)

fac.trans.dummies.dat <- cbind(fac.trans.dat, dummies.pred)

fac.trans.dummies.dat <- fac.trans.dummies.dat[,!colnames(fac.trans.dummies.dat) %in% fac.var]

dim(fac.trans.dummies.dat)
## [1] 2900  211
#If a dummy column sums to 0 within the training (or test) rows, that level never occurs in that set
which(colSums(fac.trans.dummies.dat[!is.na(trans.dat$SalePrice),])==0)
## MSSubClass.150  LotConfig.FR3 
##             13             27
fac.trans.dummies.dat <- fac.trans.dummies.dat[,which(colSums(fac.trans.dummies.dat[!is.na(trans.dat$SalePrice),])!=0)]

which(colSums(fac.trans.dummies.dat[is.na(trans.dat$SalePrice),])==0)
## Electrical.Mix 
##            115
fac.trans.dummies.dat <- fac.trans.dummies.dat[,which(colSums(fac.trans.dummies.dat[is.na(trans.dat$SalePrice),])!=0)]

fac.trans.dummies.dat %>% dim
## [1] 2900  208
#Combine
comb.trans.dat <- cbind(num.trans.dat, fac.trans.dummies.dat)

comb.trans.dat %>% dim
## [1] 2900  254
training <- comb.trans.dat %>% filter(!is.na(SalePrice))
testing <- comb.trans.dat %>% filter(is.na(SalePrice))

training %>% dim
## [1] 1441  254
testing %>% dim
## [1] 1459  254
which(colSums(training) ==0)
## named integer(0)
which(colSums(testing) ==0)
## named integer(0)

Caret - Cross Validation, Creating useful function for modeling

#creating function for Caret modeling

# Fit a caret model of SalePrice on all other columns of `training`.
#
# method:   method name passed to train()
# training: data frame containing SalePrice and the predictors
# control:  object passed to train() as trControl
# grid:     tuning grid passed as tuneGrid, or NULL to leave tuneGrid unset
# ...:      further arguments forwarded to train()
#
# Returns the object produced by train().
model <- function(method, training, control,grid,...){

  # Without a grid, tuneGrid is left out of the call entirely
  if(is.null(grid)){
    return(train(SalePrice ~ .,
                 data = training,
                 method = method,
                 trControl = control,
                 ...))
  }

  train(SalePrice ~ .,
        data = training,
        method = method,
        trControl = control,
        tuneGrid = grid,
        ...)
}


#10 folds cv
control <- trainControl(method = "cv", number = 10)

Modeling and Bootstrapping for final prediction and averaging them (Soft Voting)

training %>% dim
## [1] 1441  254
#I will use Ridge / Lasso / Elastic Net / XGBoost
#creating functions to fit each algorithm

lambda <- seq(0.001,0.1,by = 0.0001)

# Ridge regression: tune glmnet with alpha fixed at 0 over the global `lambda`
# grid using the global `control`, then predict on `test`.
# The predictions are passed through exp() before being returned.
ridge.func <- function(train, test){
  ridge.fit <- model("glmnet", train, control,
                     expand.grid(alpha = 0, lambda = lambda)) #ridge: alpha = 0
  exp(predict(ridge.fit, test))
}

# Lasso regression: tune glmnet with alpha fixed at 1 over the global `lambda`
# grid using the global `control`, then predict on `test`.
# The predictions are passed through exp() before being returned.
lasso.func <- function(train,test){
  lasso.fit <- model("glmnet", train, control,
                     expand.grid(alpha = 1, lambda = lambda)) #lasso: alpha = 1
  exp(predict(lasso.fit, test))
}

# Elastic net: let caret tune glmnet (no explicit grid, tuneLength = 10) using
# the global `control`, then predict on `test`.
#
# train: data frame used to fit the model
# test:  data frame to predict on
#
# The function fits and predicts on its own arguments rather than on the
# global training/testing objects. Predictions are passed through exp()
# before being returned.
elastic.func <- function(train,test){
  elastic.model <- model("glmnet", train, control, grid=NULL, tuneLength = 10)
  exp(predict(elastic.model, test))
}





control <- trainControl(method = "cv", number = 10)

#Grid Search
xgb.grid <- expand.grid(nrounds = 1000, #boosting iterations
                        eta = c(0.01, 0.1, 0.4), #Shrinkage
                        max_depth = c(1,2,3), #max tree depth
                        gamma = c(0,0.01, 0.1), #minimum loss reduction
                        colsample_bytree = c(0.5, 1), #subsample ratio of columns
                        min_child_weight = c(1, 3), #minimum sum of instance weight
                        subsample = c(0.5, 0.8)) #subsample percentage

#This takes long time, so I will skip this process
#xgb.model <- model("xgbTree", training, control, grid = xgb.grid)

#xgb.model

#xgb.model$bestTune

#min(xgb.model$results$RMSE)
#As the result of cv score, the parameter has been set as the below


label_train <- training$SalePrice
# put our testing & training data into two separate DMatrix objects
dtrain <- xgb.DMatrix(data = as.matrix(training %>% subset(select=-c(SalePrice))), label= label_train)
dtest <- xgb.DMatrix(data = as.matrix(testing %>% subset(select = -c(SalePrice))))

# Booster parameters taken from the CV / grid search above.
# NOTE(review): newer xgboost releases call this objective
# "reg:squarederror" -- confirm against the installed version before renaming.
default_param <- list(
  objective = "reg:linear",
  booster = "gbtree",
  eta = 0.1,              # default = 0.3
  gamma = 0,
  max_depth = 2,          # default = 6
  min_child_weight = 1,   # default = 1
  subsample = 0.8,
  colsample_bytree = 0.5
)


# XGBoost wrapper: pick the number of boosting rounds by 10-fold CV with
# early stopping, refit on all of `train` with that many rounds, then predict.
#
# train: training data passed to xgb.cv() and xgb.train() (the script calls
#        this with the xgb.DMatrix `dtrain`).
# test:  data passed to predict() (the script calls this with `dtest`).
# Returns exp() of the booster's predictions (presumably the label is on the
# log scale -- TODO confirm).
# Reads the global parameter list `default_param`.
xgb.func <- function(train, test){
  # TRUE/FALSE instead of T/F: the shorthands are ordinary variables and can
  # be reassigned elsewhere in the session.
  xgbcv <- xgb.cv(params = default_param,
                  data = train, nrounds = 1000,
                  nfold = 10,
                  showsd = TRUE,
                  stratified = TRUE,
                  print_every_n = 40,
                  early_stopping_rounds = 50,
                  maximize = FALSE)

  # Fail with a clear message rather than passing nrounds = NULL on.
  best_iter <- xgbcv$best_iteration
  if (is.null(best_iter)) {
    stop("xgb.cv() did not report a best_iteration", call. = FALSE)
  }

  xgb_mod <- xgb.train(data = train, params = default_param, nrounds = best_iter)

  # Return the value visibly (the old body ended on an assignment).
  exp(predict(xgb_mod, test))
}



# Container for the averaged predictions: one row per testing observation,
# one column per repetition.
# Every algorithm is fitted n_runs times and the results are averaged later.
n_test <- length(testing$SalePrice)
n_runs <- 10
boot.dat <- as.data.frame(matrix(data = NA,
                                 nrow = n_test,
                                 ncol = n_runs,
                                 dimnames = list(seq_len(n_test),
                                                 seq_len(n_runs))))

for (i in seq_len(n_runs)) {
  ridge <- ridge.func(training, testing)
  lasso <- lasso.func(training, testing)
  elastic <- elastic.func(training, testing)
  xgb <- xgb.func(dtrain, dtest)

  # Soft voting: unweighted mean of the four models' predictions.
  softVoting <- (ridge + lasso + elastic + xgb) / 4

  boot.dat[, i] <- softVoting
}
## [1]  train-rmse:10.384236+0.003143   test-rmse:10.384182+0.032328 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.205466+0.001355    test-rmse:0.213161+0.011837 
## [81] train-rmse:0.100826+0.001624    test-rmse:0.118548+0.003974 
## [121]    train-rmse:0.089879+0.001049    test-rmse:0.110303+0.002388 
## [161]    train-rmse:0.083567+0.000777    test-rmse:0.106588+0.002584 
## [201]    train-rmse:0.078953+0.000593    test-rmse:0.104449+0.002919 
## [241]    train-rmse:0.075205+0.000572    test-rmse:0.102858+0.002989 
## [281]    train-rmse:0.072058+0.000564    test-rmse:0.101888+0.003262 
## [321]    train-rmse:0.069312+0.000628    test-rmse:0.101193+0.003443 
## [361]    train-rmse:0.066861+0.000604    test-rmse:0.100782+0.003495 
## [401]    train-rmse:0.064595+0.000569    test-rmse:0.100477+0.003514 
## [441]    train-rmse:0.062610+0.000537    test-rmse:0.100026+0.003375 
## [481]    train-rmse:0.060716+0.000511    test-rmse:0.099681+0.003486 
## [521]    train-rmse:0.058905+0.000531    test-rmse:0.099666+0.003588 
## [561]    train-rmse:0.057263+0.000551    test-rmse:0.099352+0.003448 
## [601]    train-rmse:0.055634+0.000502    test-rmse:0.099354+0.003688 
## [641]    train-rmse:0.054142+0.000458    test-rmse:0.099346+0.003704 
## [681]    train-rmse:0.052747+0.000455    test-rmse:0.099280+0.003684 
## [721]    train-rmse:0.051409+0.000483    test-rmse:0.099220+0.003736 
## [761]    train-rmse:0.050110+0.000502    test-rmse:0.099106+0.003679 
## [801]    train-rmse:0.048900+0.000477    test-rmse:0.099114+0.003727 
## [841]    train-rmse:0.047729+0.000513    test-rmse:0.099185+0.003707 
## Stopping. Best iteration:
## [820]    train-rmse:0.048341+0.000484    test-rmse:0.099071+0.003708
## 
## [1]  train-rmse:10.384242+0.003190   test-rmse:10.384178+0.033978 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.204587+0.001136    test-rmse:0.212765+0.012608 
## [81] train-rmse:0.100456+0.002061    test-rmse:0.117823+0.010624 
## [121]    train-rmse:0.089611+0.001614    test-rmse:0.110413+0.010593 
## [161]    train-rmse:0.083273+0.001345    test-rmse:0.106997+0.010502 
## [201]    train-rmse:0.078645+0.001218    test-rmse:0.105266+0.010054 
## [241]    train-rmse:0.075053+0.001104    test-rmse:0.103965+0.010103 
## [281]    train-rmse:0.071903+0.001087    test-rmse:0.103041+0.010058 
## [321]    train-rmse:0.069129+0.001051    test-rmse:0.102305+0.010460 
## [361]    train-rmse:0.066696+0.001050    test-rmse:0.102134+0.010556 
## [401]    train-rmse:0.064430+0.000968    test-rmse:0.101550+0.010506 
## [441]    train-rmse:0.062375+0.000982    test-rmse:0.101330+0.010417 
## [481]    train-rmse:0.060471+0.000991    test-rmse:0.101027+0.010316 
## [521]    train-rmse:0.058680+0.000956    test-rmse:0.100868+0.010532 
## [561]    train-rmse:0.057017+0.000939    test-rmse:0.100771+0.010687 
## [601]    train-rmse:0.055395+0.000956    test-rmse:0.100594+0.010810 
## [641]    train-rmse:0.053900+0.000894    test-rmse:0.100485+0.010755 
## [681]    train-rmse:0.052480+0.000855    test-rmse:0.100367+0.010847 
## [721]    train-rmse:0.051107+0.000829    test-rmse:0.100343+0.010811 
## [761]    train-rmse:0.049830+0.000768    test-rmse:0.100310+0.010823 
## [801]    train-rmse:0.048569+0.000747    test-rmse:0.100361+0.010685 
## Stopping. Best iteration:
## [781]    train-rmse:0.049170+0.000764    test-rmse:0.100256+0.010783
## 
## [1]  train-rmse:10.384455+0.002890   test-rmse:10.384409+0.031075 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.204991+0.000890    test-rmse:0.212347+0.015601 
## [81] train-rmse:0.100720+0.001310    test-rmse:0.117781+0.009149 
## [121]    train-rmse:0.089762+0.001253    test-rmse:0.109789+0.007691 
## [161]    train-rmse:0.083325+0.000950    test-rmse:0.106130+0.007067 
## [201]    train-rmse:0.078707+0.000851    test-rmse:0.103994+0.006322 
## [241]    train-rmse:0.075070+0.000739    test-rmse:0.102977+0.005995 
## [281]    train-rmse:0.071913+0.000632    test-rmse:0.102067+0.005882 
## [321]    train-rmse:0.069192+0.000621    test-rmse:0.101472+0.005445 
## [361]    train-rmse:0.066685+0.000629    test-rmse:0.100864+0.005520 
## [401]    train-rmse:0.064408+0.000604    test-rmse:0.100367+0.005632 
## [441]    train-rmse:0.062318+0.000610    test-rmse:0.099901+0.005530 
## [481]    train-rmse:0.060346+0.000633    test-rmse:0.099781+0.005324 
## [521]    train-rmse:0.058506+0.000610    test-rmse:0.099817+0.005101 
## [561]    train-rmse:0.056828+0.000645    test-rmse:0.099596+0.005179 
## [601]    train-rmse:0.055244+0.000608    test-rmse:0.099432+0.005318 
## [641]    train-rmse:0.053768+0.000653    test-rmse:0.099347+0.005307 
## Stopping. Best iteration:
## [621]    train-rmse:0.054508+0.000649    test-rmse:0.099275+0.005300
## 
## [1]  train-rmse:10.384518+0.001656   test-rmse:10.384514+0.018327 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.204141+0.000780    test-rmse:0.210337+0.010671 
## [81] train-rmse:0.099900+0.000980    test-rmse:0.116184+0.008119 
## [121]    train-rmse:0.089191+0.000887    test-rmse:0.108988+0.007603 
## [161]    train-rmse:0.082761+0.000878    test-rmse:0.105510+0.007215 
## [201]    train-rmse:0.078164+0.000921    test-rmse:0.103568+0.006971 
## [241]    train-rmse:0.074584+0.000909    test-rmse:0.102622+0.006948 
## [281]    train-rmse:0.071490+0.000895    test-rmse:0.101558+0.006658 
## [321]    train-rmse:0.068756+0.000856    test-rmse:0.101019+0.006644 
## [361]    train-rmse:0.066308+0.000841    test-rmse:0.100574+0.006753 
## [401]    train-rmse:0.064157+0.000836    test-rmse:0.100220+0.006863 
## [441]    train-rmse:0.062098+0.000863    test-rmse:0.099952+0.006773 
## [481]    train-rmse:0.060238+0.000829    test-rmse:0.099949+0.006648 
## [521]    train-rmse:0.058453+0.000836    test-rmse:0.099759+0.006526 
## [561]    train-rmse:0.056808+0.000801    test-rmse:0.099797+0.006599 
## Stopping. Best iteration:
## [514]    train-rmse:0.058779+0.000818    test-rmse:0.099704+0.006523
## 
## [1]  train-rmse:10.384420+0.003623   test-rmse:10.384358+0.033972 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.204698+0.001255    test-rmse:0.211661+0.012184 
## [81] train-rmse:0.100211+0.001347    test-rmse:0.116788+0.005316 
## [121]    train-rmse:0.089396+0.000970    test-rmse:0.109501+0.005314 
## [161]    train-rmse:0.082992+0.000617    test-rmse:0.106006+0.005310 
## [201]    train-rmse:0.078442+0.000380    test-rmse:0.103844+0.005116 
## [241]    train-rmse:0.074858+0.000401    test-rmse:0.102883+0.005116 
## [281]    train-rmse:0.071785+0.000412    test-rmse:0.101987+0.004952 
## [321]    train-rmse:0.069082+0.000357    test-rmse:0.101525+0.005017 
## [361]    train-rmse:0.066641+0.000360    test-rmse:0.101040+0.004773 
## [401]    train-rmse:0.064337+0.000322    test-rmse:0.100668+0.004767 
## [441]    train-rmse:0.062226+0.000362    test-rmse:0.100283+0.004677 
## [481]    train-rmse:0.060334+0.000337    test-rmse:0.100266+0.004464 
## Stopping. Best iteration:
## [449]    train-rmse:0.061839+0.000366    test-rmse:0.100205+0.004570
## 
## [1]  train-rmse:10.384332+0.001989   test-rmse:10.384321+0.022968 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.204803+0.000989    test-rmse:0.211856+0.013986 
## [81] train-rmse:0.100139+0.000893    test-rmse:0.116000+0.007644 
## [121]    train-rmse:0.089301+0.000794    test-rmse:0.108469+0.007534 
## [161]    train-rmse:0.082873+0.000741    test-rmse:0.104816+0.007025 
## [201]    train-rmse:0.078325+0.000822    test-rmse:0.102844+0.006675 
## [241]    train-rmse:0.074646+0.000810    test-rmse:0.101514+0.006592 
## [281]    train-rmse:0.071647+0.000812    test-rmse:0.100678+0.006527 
## [321]    train-rmse:0.068943+0.000828    test-rmse:0.099877+0.006740 
## [361]    train-rmse:0.066501+0.000828    test-rmse:0.099243+0.006839 
## [401]    train-rmse:0.064264+0.000724    test-rmse:0.098658+0.006834 
## [441]    train-rmse:0.062209+0.000651    test-rmse:0.098530+0.006895 
## [481]    train-rmse:0.060276+0.000624    test-rmse:0.098161+0.006759 
## [521]    train-rmse:0.058511+0.000627    test-rmse:0.097992+0.006763 
## [561]    train-rmse:0.056844+0.000571    test-rmse:0.097955+0.006818 
## [601]    train-rmse:0.055256+0.000588    test-rmse:0.098003+0.006894 
## [641]    train-rmse:0.053751+0.000568    test-rmse:0.097862+0.006820 
## [681]    train-rmse:0.052389+0.000581    test-rmse:0.097544+0.006810 
## [721]    train-rmse:0.051105+0.000556    test-rmse:0.097534+0.006840 
## [761]    train-rmse:0.049810+0.000529    test-rmse:0.097546+0.006883 
## [801]    train-rmse:0.048620+0.000525    test-rmse:0.097644+0.007005 
## Stopping. Best iteration:
## [758]    train-rmse:0.049913+0.000521    test-rmse:0.097485+0.006865
## 
## [1]  train-rmse:10.384172+0.003062   test-rmse:10.384141+0.029251 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.204862+0.000978    test-rmse:0.212259+0.015847 
## [81] train-rmse:0.100322+0.000475    test-rmse:0.117955+0.009930 
## [121]    train-rmse:0.089438+0.000576    test-rmse:0.109735+0.007324 
## [161]    train-rmse:0.082897+0.000700    test-rmse:0.105952+0.006388 
## [201]    train-rmse:0.078329+0.000702    test-rmse:0.103761+0.006131 
## [241]    train-rmse:0.074714+0.000637    test-rmse:0.102176+0.006067 
## [281]    train-rmse:0.071708+0.000665    test-rmse:0.101535+0.006050 
## [321]    train-rmse:0.069045+0.000716    test-rmse:0.101084+0.006074 
## [361]    train-rmse:0.066652+0.000677    test-rmse:0.100495+0.006113 
## [401]    train-rmse:0.064418+0.000681    test-rmse:0.099904+0.006237 
## [441]    train-rmse:0.062345+0.000683    test-rmse:0.099512+0.006175 
## [481]    train-rmse:0.060471+0.000689    test-rmse:0.099489+0.006355 
## [521]    train-rmse:0.058655+0.000635    test-rmse:0.099190+0.006505 
## [561]    train-rmse:0.056969+0.000652    test-rmse:0.098948+0.006441 
## [601]    train-rmse:0.055392+0.000667    test-rmse:0.098898+0.006458 
## Stopping. Best iteration:
## [581]    train-rmse:0.056176+0.000633    test-rmse:0.098830+0.006455
## 
## [1]  train-rmse:10.384512+0.002747   test-rmse:10.384483+0.026557 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.205309+0.000881    test-rmse:0.213082+0.008706 
## [81] train-rmse:0.100637+0.000841    test-rmse:0.117861+0.009113 
## [121]    train-rmse:0.089681+0.000611    test-rmse:0.109936+0.008826 
## [161]    train-rmse:0.083206+0.000726    test-rmse:0.105877+0.008533 
## [201]    train-rmse:0.078632+0.000739    test-rmse:0.103923+0.008450 
## [241]    train-rmse:0.074951+0.000772    test-rmse:0.102874+0.008223 
## [281]    train-rmse:0.071819+0.000699    test-rmse:0.101963+0.008052 
## [321]    train-rmse:0.069072+0.000741    test-rmse:0.101217+0.007820 
## [361]    train-rmse:0.066647+0.000716    test-rmse:0.100837+0.007614 
## [401]    train-rmse:0.064411+0.000728    test-rmse:0.100558+0.007385 
## [441]    train-rmse:0.062379+0.000690    test-rmse:0.100093+0.007521 
## [481]    train-rmse:0.060440+0.000647    test-rmse:0.099926+0.007348 
## [521]    train-rmse:0.058651+0.000733    test-rmse:0.099678+0.007285 
## Stopping. Best iteration:
## [506]    train-rmse:0.059294+0.000700    test-rmse:0.099630+0.007353
## 
## [1]  train-rmse:10.384393+0.002276   test-rmse:10.384363+0.022346 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.204988+0.000667    test-rmse:0.212765+0.008511 
## [81] train-rmse:0.100581+0.001005    test-rmse:0.117448+0.009703 
## [121]    train-rmse:0.089938+0.001044    test-rmse:0.110242+0.009866 
## [161]    train-rmse:0.083511+0.001094    test-rmse:0.106575+0.009465 
## [201]    train-rmse:0.078858+0.001064    test-rmse:0.104306+0.009334 
## [241]    train-rmse:0.075212+0.001039    test-rmse:0.102854+0.009302 
## [281]    train-rmse:0.072069+0.001091    test-rmse:0.101889+0.009181 
## [321]    train-rmse:0.069282+0.001067    test-rmse:0.101211+0.009047 
## [361]    train-rmse:0.066846+0.000961    test-rmse:0.100713+0.008887 
## [401]    train-rmse:0.064581+0.001008    test-rmse:0.100222+0.008885 
## [441]    train-rmse:0.062606+0.000975    test-rmse:0.100143+0.008889 
## [481]    train-rmse:0.060639+0.000892    test-rmse:0.099903+0.008936 
## [521]    train-rmse:0.058839+0.000889    test-rmse:0.099583+0.009034 
## [561]    train-rmse:0.057129+0.000811    test-rmse:0.099381+0.009047 
## [601]    train-rmse:0.055611+0.000811    test-rmse:0.099208+0.008962 
## [641]    train-rmse:0.054131+0.000772    test-rmse:0.099151+0.009024 
## [681]    train-rmse:0.052735+0.000757    test-rmse:0.099114+0.008979 
## [721]    train-rmse:0.051360+0.000771    test-rmse:0.099065+0.009084 
## [761]    train-rmse:0.050069+0.000788    test-rmse:0.099146+0.009177 
## Stopping. Best iteration:
## [715]    train-rmse:0.051554+0.000768    test-rmse:0.099031+0.009063
## 
## [1]  train-rmse:10.384298+0.002452   test-rmse:10.384253+0.024881 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## [41] train-rmse:0.204999+0.001232    test-rmse:0.212886+0.013278 
## [81] train-rmse:0.100121+0.001300    test-rmse:0.118862+0.010629 
## [121]    train-rmse:0.089526+0.001081    test-rmse:0.111519+0.009659 
## [161]    train-rmse:0.083077+0.000998    test-rmse:0.107928+0.009708 
## [201]    train-rmse:0.078497+0.000937    test-rmse:0.105182+0.009762 
## [241]    train-rmse:0.074842+0.000843    test-rmse:0.103392+0.009939 
## [281]    train-rmse:0.071778+0.000777    test-rmse:0.102624+0.009817 
## [321]    train-rmse:0.068953+0.000857    test-rmse:0.101802+0.009525 
## [361]    train-rmse:0.066485+0.000761    test-rmse:0.101184+0.009445 
## [401]    train-rmse:0.064280+0.000679    test-rmse:0.100570+0.009575 
## [441]    train-rmse:0.062224+0.000665    test-rmse:0.100028+0.009601 
## [481]    train-rmse:0.060352+0.000667    test-rmse:0.099748+0.009517 
## [521]    train-rmse:0.058540+0.000631    test-rmse:0.099463+0.009452 
## [561]    train-rmse:0.056821+0.000581    test-rmse:0.099382+0.009453 
## [601]    train-rmse:0.055258+0.000552    test-rmse:0.099371+0.009428 
## [641]    train-rmse:0.053803+0.000535    test-rmse:0.099115+0.009382 
## [681]    train-rmse:0.052444+0.000490    test-rmse:0.098950+0.009199 
## [721]    train-rmse:0.051084+0.000468    test-rmse:0.098925+0.009364 
## Stopping. Best iteration:
## [688]    train-rmse:0.052213+0.000486    test-rmse:0.098833+0.009243
# Average the repetitions: one mean prediction per row of boot.dat.
# rowMeans() replaces apply(boot.dat, 1, mean) and keeps the row names.
real.pred <- rowMeans(boot.dat)


# Creating submission
# NOTE(review): `test` is presumably the raw test set loaded earlier in the
# file, with its rows in the same order as `testing` -- confirm.
submission <- data.frame(Id = test$Id, SalePrice = real.pred)

head(submission)
##     Id SalePrice
## 1 1461  118917.9
## 2 1462  162902.6
## 3 1463  185609.8
## 4 1464  195243.1
## 5 1465  198986.4
## 6 1466  168103.9
#write.csv(submission, "C:/Users/husie/Desktop/Kaggle/House Price/final 45.csv", row.names = FALSE)